Diffstat (limited to 'src')
178 files changed, 69668 insertions, 0 deletions
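The first new file, src/cache.c, implements a thread-safe in-memory object cache with byte-buffer (de)serialization. As a caller-side sketch of the main entry points it introduces (the wrapper name, key value and buffer handling below are hypothetical, and error handling is omitted):

#include <stdint.h>
#include <stdlib.h>
#include <libplacebo/cache.h>

static void cache_usage_sketch(pl_cache cache)
{
    // Insert an object; with .free left unset, the cache stores its own copy.
    // On success, pl_cache_set takes ownership and clears the struct.
    uint8_t payload[256] = {0};
    pl_cache_obj obj = {
        .key  = 0x1234abcdULL, // arbitrary example key
        .data = payload,
        .size = sizeof(payload),
    };
    pl_cache_set(cache, &obj);

    // Look it up again; on success the caller receives ownership and must
    // release the data via the returned .free callback. Note that the
    // lookup also removes the entry from the cache.
    pl_cache_obj found = { .key = 0x1234abcdULL };
    if (pl_cache_get(cache, &found))
        found.free(found.data);

    // Serialize the whole cache: the first call only measures the size
    size_t size = pl_cache_save(cache, NULL, 0);
    uint8_t *buf = malloc(size);
    if (buf) {
        pl_cache_save(cache, buf, size);
        pl_cache_load(cache, buf, size); // restores the saved objects
        free(buf);
    }
}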
diff --git a/src/cache.c b/src/cache.c new file mode 100644 index 0000000..4f8ed4e --- /dev/null +++ b/src/cache.c @@ -0,0 +1,447 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <stdio.h> +#include <locale.h> +#include <limits.h> + +#include "common.h" +#include "cache.h" +#include "log.h" +#include "pl_thread.h" + +const struct pl_cache_params pl_cache_default_params = {0}; + +struct priv { + pl_log log; + pl_mutex lock; + PL_ARRAY(pl_cache_obj) objects; + size_t total_size; +}; + +int pl_cache_objects(pl_cache cache) +{ + if (!cache) + return 0; + + struct priv *p = PL_PRIV(cache); + pl_mutex_lock(&p->lock); + int num = p->objects.num; + pl_mutex_unlock(&p->lock); + return num; +} + +size_t pl_cache_size(pl_cache cache) +{ + if (!cache) + return 0; + + struct priv *p = PL_PRIV(cache); + pl_mutex_lock(&p->lock); + size_t size = p->total_size; + pl_mutex_unlock(&p->lock); + return size; +} + +pl_cache pl_cache_create(const struct pl_cache_params *params) +{ + struct pl_cache_t *cache = pl_zalloc_obj(NULL, cache, struct priv); + struct priv *p = PL_PRIV(cache); + pl_mutex_init(&p->lock); + if (params) { + cache->params = *params; + p->log = params->log; + } + + // Sanitize size limits + size_t total_size = PL_DEF(cache->params.max_total_size, SIZE_MAX); + size_t object_size = PL_DEF(cache->params.max_object_size, SIZE_MAX); + object_size = PL_MIN(total_size, object_size); + cache->params.max_total_size = total_size; + cache->params.max_object_size = object_size; + + return cache; +} + +static void remove_obj(pl_cache cache, pl_cache_obj obj) +{ + struct priv *p = PL_PRIV(cache); + + p->total_size -= obj.size; + if (obj.free) + obj.free(obj.data); +} + +void pl_cache_destroy(pl_cache *pcache) +{ + pl_cache cache = *pcache; + if (!cache) + return; + + struct priv *p = PL_PRIV(cache); + for (int i = 0; i < p->objects.num; i++) + remove_obj(cache, p->objects.elem[i]); + + pl_assert(p->total_size == 0); + pl_mutex_destroy(&p->lock); + pl_free((void *) cache); + *pcache = NULL; +} + +void pl_cache_reset(pl_cache cache) +{ + if (!cache) + return; + + struct priv *p = PL_PRIV(cache); + pl_mutex_lock(&p->lock); + for (int i = 0; i < p->objects.num; i++) + remove_obj(cache, p->objects.elem[i]); + p->objects.num = 0; + pl_assert(p->total_size == 0); + pl_mutex_unlock(&p->lock); +} + +static bool try_set(pl_cache cache, pl_cache_obj obj) +{ + struct priv *p = PL_PRIV(cache); + + // Remove any existing entry with this key + for (int i = p->objects.num - 1; i >= 0; i--) { + pl_cache_obj prev = p->objects.elem[i]; + if (prev.key == obj.key) { + PL_TRACE(p, "Removing out-of-date object 0x%"PRIx64, prev.key); + remove_obj(cache, prev); + PL_ARRAY_REMOVE_AT(p->objects, i); + break; + } + } + + if (!obj.size) { + PL_TRACE(p, "Deleted object 0x%"PRIx64, obj.key); + return true; + } + + if (obj.size > cache->params.max_object_size) { 
+ PL_DEBUG(p, "Object 0x%"PRIx64" (size %zu) exceeds max size %zu, discarding", + obj.key, obj.size, cache->params.max_object_size); + return false; + } + + // Make space by deleting old objects + while (p->total_size + obj.size > cache->params.max_total_size || + p->objects.num == INT_MAX) + { + pl_assert(p->objects.num); + pl_cache_obj old = p->objects.elem[0]; + PL_TRACE(p, "Removing object 0x%"PRIx64" (size %zu) to make room", + old.key, old.size); + remove_obj(cache, old); + PL_ARRAY_REMOVE_AT(p->objects, 0); + } + + if (!obj.free) { + obj.data = pl_memdup(NULL, obj.data, obj.size); + obj.free = pl_free; + } + + PL_TRACE(p, "Inserting new object 0x%"PRIx64" (size %zu)", obj.key, obj.size); + PL_ARRAY_APPEND((void *) cache, p->objects, obj); + p->total_size += obj.size; + return true; +} + +static pl_cache_obj strip_obj(pl_cache_obj obj) +{ + return (pl_cache_obj) { .key = obj.key }; +} + +bool pl_cache_try_set(pl_cache cache, pl_cache_obj *pobj) +{ + if (!cache) + return false; + + pl_cache_obj obj = *pobj; + struct priv *p = PL_PRIV(cache); + pl_mutex_lock(&p->lock); + bool ok = try_set(cache, obj); + pl_mutex_unlock(&p->lock); + if (ok) { + *pobj = strip_obj(obj); // ownership transfers, clear ptr + } else { + obj = strip_obj(obj); // ownership remains with caller, clear copy + } + if (cache->params.set) + cache->params.set(cache->params.priv, obj); + return ok; +} + +void pl_cache_set(pl_cache cache, pl_cache_obj *obj) +{ + if (!pl_cache_try_set(cache, obj)) { + if (obj->free) + obj->free(obj->data); + *obj = (pl_cache_obj) { .key = obj->key }; + } +} + +static void noop(void *ignored) +{ + (void) ignored; +} + +bool pl_cache_get(pl_cache cache, pl_cache_obj *out_obj) +{ + const uint64_t key = out_obj->key; + if (!cache) + goto fail; + + struct priv *p = PL_PRIV(cache); + pl_mutex_lock(&p->lock); + + // Search backwards to prioritize recently added entries + for (int i = p->objects.num - 1; i >= 0; i--) { + pl_cache_obj obj = p->objects.elem[i]; + if (obj.key == key) { + PL_ARRAY_REMOVE_AT(p->objects, i); + p->total_size -= obj.size; + pl_mutex_unlock(&p->lock); + pl_assert(obj.free); + *out_obj = obj; + return true; + } + } + + pl_mutex_unlock(&p->lock); + if (!cache->params.get) + goto fail; + + pl_cache_obj obj = cache->params.get(cache->params.priv, key); + if (!obj.size) + goto fail; + + // Sanitize object + obj.key = key; + obj.free = PL_DEF(obj.free, noop); + *out_obj = obj; + return true; + +fail: + *out_obj = (pl_cache_obj) { .key = key }; + return false; +} + +void pl_cache_iterate(pl_cache cache, + void (*cb)(void *priv, pl_cache_obj obj), + void *priv) +{ + if (!cache) + return; + + struct priv *p = PL_PRIV(cache); + pl_mutex_lock(&p->lock); + for (int i = 0; i < p->objects.num; i++) + cb(priv, p->objects.elem[i]); + pl_mutex_unlock(&p->lock); +} + +// --- Saving/loading + +#define CACHE_MAGIC "pl_cache" +#define CACHE_VERSION 1 +#define PAD_ALIGN(x) PL_ALIGN2(x, sizeof(uint32_t)) + +struct __attribute__((__packed__)) cache_header { + char magic[8]; + uint32_t version; + uint32_t num_entries; +}; + +struct __attribute__((__packed__)) cache_entry { + uint64_t key; + uint64_t size; + uint64_t hash; +}; + +pl_static_assert(sizeof(struct cache_header) % alignof(struct cache_entry) == 0); + +int pl_cache_save_ex(pl_cache cache, + void (*write)(void *priv, size_t size, const void *ptr), + void *priv) +{ + if (!cache) + return 0; + + struct priv *p = PL_PRIV(cache); + pl_mutex_lock(&p->lock); + pl_clock_t start = pl_clock_now(); + + const int num_objects = p->objects.num; + 
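For orientation, the byte layout these writes produce (a descriptive summary of the code below, not part of the commit itself):

/*
 * Serialized layout written by pl_cache_save_ex():
 *
 *   struct cache_header        - "pl_cache" magic, version, num_entries
 *   then, for each object:
 *     struct cache_entry       - key, size, hash of the payload
 *     uint8_t data[size]       - raw object payload
 *     zero padding             - up to the next 4-byte (uint32_t) boundary
 */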
const size_t saved_bytes = p->total_size; + write(priv, sizeof(struct cache_header), &(struct cache_header) { + .magic = CACHE_MAGIC, + .version = CACHE_VERSION, + .num_entries = num_objects, + }); + + for (int i = 0; i < num_objects; i++) { + pl_cache_obj obj = p->objects.elem[i]; + PL_TRACE(p, "Saving object 0x%"PRIx64" (size %zu)", obj.key, obj.size); + write(priv, sizeof(struct cache_entry), &(struct cache_entry) { + .key = obj.key, + .size = obj.size, + .hash = pl_mem_hash(obj.data, obj.size), + }); + static const uint8_t padding[PAD_ALIGN(1)] = {0}; + write(priv, obj.size, obj.data); + write(priv, PAD_ALIGN(obj.size) - obj.size, padding); + } + + pl_mutex_unlock(&p->lock); + pl_log_cpu_time(p->log, start, pl_clock_now(), "saving cache"); + if (num_objects) + PL_DEBUG(p, "Saved %d objects, totalling %zu bytes", num_objects, saved_bytes); + + return num_objects; +} + +int pl_cache_load_ex(pl_cache cache, + bool (*read)(void *priv, size_t size, void *ptr), + void *priv) +{ + if (!cache) + return 0; + + struct priv *p = PL_PRIV(cache); + struct cache_header header; + if (!read(priv, sizeof(header), &header)) { + PL_ERR(p, "Failed loading cache: file seems empty or truncated"); + return -1; + } + if (memcmp(header.magic, CACHE_MAGIC, sizeof(header.magic)) != 0) { + PL_ERR(p, "Failed loading cache: invalid magic bytes"); + return -1; + } + if (header.version != CACHE_VERSION) { + PL_INFO(p, "Failed loading cache: wrong version... skipping"); + return 0; + } + if (header.num_entries > INT_MAX) { + PL_ERR(p, "Failed loading cache: %"PRIu32" entries overflows int", + header.num_entries); + return 0; + } + + int num_loaded = 0; + size_t loaded_bytes = 0; + pl_mutex_lock(&p->lock); + pl_clock_t start = pl_clock_now(); + + for (int i = 0; i < header.num_entries; i++) { + struct cache_entry entry; + if (!read(priv, sizeof(entry), &entry)) { + PL_WARN(p, "Cache seems truncated, missing objects.. ignoring rest"); + goto error; + } + + if (entry.size > SIZE_MAX) { + PL_WARN(p, "Cache object size %"PRIu64" overflows SIZE_MAX.. " + "suspect broken file, ignoring rest", entry.size); + goto error; + } + + void *buf = pl_alloc(NULL, PAD_ALIGN(entry.size)); + if (!read(priv, PAD_ALIGN(entry.size), buf)) { + PL_WARN(p, "Cache seems truncated, missing objects.. ignoring rest"); + pl_free(buf); + goto error; + } + + uint64_t checksum = pl_mem_hash(buf, entry.size); + if (checksum != entry.hash) { + PL_WARN(p, "Cache entry seems corrupt, checksum mismatch.. 
ignoring rest"); + pl_free(buf); + goto error; + } + + pl_cache_obj obj = { + .key = entry.key, + .size = entry.size, + .data = buf, + .free = pl_free, + }; + + PL_TRACE(p, "Loading object 0x%"PRIx64" (size %zu)", obj.key, obj.size); + if (try_set(cache, obj)) { + num_loaded++; + loaded_bytes += entry.size; + } else { + pl_free(buf); + } + } + + pl_log_cpu_time(p->log, start, pl_clock_now(), "loading cache"); + if (num_loaded) + PL_DEBUG(p, "Loaded %d objects, totalling %zu bytes", num_loaded, loaded_bytes); + + // fall through +error: + pl_mutex_unlock(&p->lock); + return num_loaded; +} + +// Save/load wrappers + +struct ptr_ctx { + uint8_t *data; // base pointer + size_t size; // total size + size_t pos; // read/write index +}; + +static void write_ptr(void *priv, size_t size, const void *ptr) +{ + struct ptr_ctx *ctx = priv; + size_t end = PL_MIN(ctx->pos + size, ctx->size); + if (end > ctx->pos) + memcpy(ctx->data + ctx->pos, ptr, end - ctx->pos); + ctx->pos += size; +} + +static bool read_ptr(void *priv, size_t size, void *ptr) +{ + struct ptr_ctx *ctx = priv; + if (ctx->pos + size > ctx->size) + return false; + memcpy(ptr, ctx->data + ctx->pos, size); + ctx->pos += size; + return true; +} + +size_t pl_cache_save(pl_cache cache, uint8_t *data, size_t size) +{ + struct ptr_ctx ctx = { data, size }; + pl_cache_save_ex(cache, write_ptr, &ctx); + return ctx.pos; +} + +int pl_cache_load(pl_cache cache, const uint8_t *data, size_t size) +{ + return pl_cache_load_ex(cache, read_ptr, &(struct ptr_ctx) { + .data = (uint8_t *) data, + .size = size, + }); +} diff --git a/src/cache.h b/src/cache.h new file mode 100644 index 0000000..7e0ff2f --- /dev/null +++ b/src/cache.h @@ -0,0 +1,72 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#pragma once + +#include "common.h" +#include "hash.h" + +#include <libplacebo/cache.h> + +// Convenience wrapper around pl_cache_set +static inline void pl_cache_str(pl_cache cache, uint64_t key, pl_str *str) +{ + pl_cache_set(cache, &(pl_cache_obj) { + .key = key, + .data = pl_steal(NULL, str->buf), + .size = str->len, + .free = pl_free, + }); + *str = (pl_str) {0}; +} + +// Steal and insert a cache object +static inline void pl_cache_steal(pl_cache cache, pl_cache_obj *obj) +{ + if (obj->free == pl_free) + obj->data = pl_steal(NULL, obj->data); + pl_cache_set(cache, obj); +} + +// Resize `obj->data` to a given size, re-using allocated buffers where possible +static inline void pl_cache_obj_resize(void *alloc, pl_cache_obj *obj, size_t size) +{ + if (obj->free != pl_free) { + if (obj->free) + obj->free(obj->data); + obj->data = pl_alloc(alloc, size); + obj->free = pl_free; + } else if (pl_get_size(obj->data) < size) { + obj->data = pl_steal(alloc, obj->data); + obj->data = pl_realloc(alloc, obj->data, size); + } + obj->size = size; +} + +// Internal list of base seeds for different object types, randomly generated + +enum { + CACHE_KEY_SH_LUT = UINT64_C(0x2206183d320352c6), // sh_lut cache + CACHE_KEY_ICC_3DLUT = UINT64_C(0xff703a6dd8a996f6), // ICC 3dlut + CACHE_KEY_DITHER = UINT64_C(0x6fed75eb6dce86cb), // dither matrix + CACHE_KEY_H274 = UINT64_C(0x2fb9adca04b42c4d), // H.274 film grain DB + CACHE_KEY_GAMUT_LUT = UINT64_C(0x6109e47f15d478b1), // gamut mapping 3DLUT + CACHE_KEY_SPIRV = UINT64_C(0x32352f6605ff60a7), // bare SPIR-V module + CACHE_KEY_VK_PIPE = UINT64_C(0x4bdab2817ad02ad4), // VkPipelineCache + CACHE_KEY_GL_PROG = UINT64_C(0x4274c309f4f0477b), // GL_ARB_get_program_binary + CACHE_KEY_D3D_DXBC = UINT64_C(0x807668516811d3bc), // DXBC bytecode +}; diff --git a/src/colorspace.c b/src/colorspace.c new file mode 100644 index 0000000..5cef2b5 --- /dev/null +++ b/src/colorspace.c @@ -0,0 +1,1609 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include <math.h> + +#include "common.h" +#include "hash.h" + +#include <libplacebo/colorspace.h> +#include <libplacebo/tone_mapping.h> + +bool pl_color_system_is_ycbcr_like(enum pl_color_system sys) +{ + switch (sys) { + case PL_COLOR_SYSTEM_UNKNOWN: + case PL_COLOR_SYSTEM_RGB: + case PL_COLOR_SYSTEM_XYZ: + return false; + case PL_COLOR_SYSTEM_BT_601: + case PL_COLOR_SYSTEM_BT_709: + case PL_COLOR_SYSTEM_SMPTE_240M: + case PL_COLOR_SYSTEM_BT_2020_NC: + case PL_COLOR_SYSTEM_BT_2020_C: + case PL_COLOR_SYSTEM_BT_2100_PQ: + case PL_COLOR_SYSTEM_BT_2100_HLG: + case PL_COLOR_SYSTEM_DOLBYVISION: + case PL_COLOR_SYSTEM_YCGCO: + return true; + case PL_COLOR_SYSTEM_COUNT: break; + }; + + pl_unreachable(); +} + +bool pl_color_system_is_linear(enum pl_color_system sys) +{ + switch (sys) { + case PL_COLOR_SYSTEM_UNKNOWN: + case PL_COLOR_SYSTEM_RGB: + case PL_COLOR_SYSTEM_BT_601: + case PL_COLOR_SYSTEM_BT_709: + case PL_COLOR_SYSTEM_SMPTE_240M: + case PL_COLOR_SYSTEM_BT_2020_NC: + case PL_COLOR_SYSTEM_YCGCO: + return true; + case PL_COLOR_SYSTEM_BT_2020_C: + case PL_COLOR_SYSTEM_BT_2100_PQ: + case PL_COLOR_SYSTEM_BT_2100_HLG: + case PL_COLOR_SYSTEM_DOLBYVISION: + case PL_COLOR_SYSTEM_XYZ: + return false; + case PL_COLOR_SYSTEM_COUNT: break; + }; + + pl_unreachable(); +} + +enum pl_color_system pl_color_system_guess_ycbcr(int width, int height) +{ + if (width >= 1280 || height > 576) { + // Typical HD content + return PL_COLOR_SYSTEM_BT_709; + } else { + // Typical SD content + return PL_COLOR_SYSTEM_BT_601; + } +} + +bool pl_bit_encoding_equal(const struct pl_bit_encoding *b1, + const struct pl_bit_encoding *b2) +{ + return b1->sample_depth == b2->sample_depth && + b1->color_depth == b2->color_depth && + b1->bit_shift == b2->bit_shift; +} + +const struct pl_color_repr pl_color_repr_unknown = {0}; + +const struct pl_color_repr pl_color_repr_rgb = { + .sys = PL_COLOR_SYSTEM_RGB, + .levels = PL_COLOR_LEVELS_FULL, +}; + +const struct pl_color_repr pl_color_repr_sdtv = { + .sys = PL_COLOR_SYSTEM_BT_601, + .levels = PL_COLOR_LEVELS_LIMITED, +}; + +const struct pl_color_repr pl_color_repr_hdtv = { + .sys = PL_COLOR_SYSTEM_BT_709, + .levels = PL_COLOR_LEVELS_LIMITED, +}; + +const struct pl_color_repr pl_color_repr_uhdtv = { + .sys = PL_COLOR_SYSTEM_BT_2020_NC, + .levels = PL_COLOR_LEVELS_LIMITED, +}; + +const struct pl_color_repr pl_color_repr_jpeg = { + .sys = PL_COLOR_SYSTEM_BT_601, + .levels = PL_COLOR_LEVELS_FULL, +}; + +bool pl_color_repr_equal(const struct pl_color_repr *c1, + const struct pl_color_repr *c2) +{ + return c1->sys == c2->sys && + c1->levels == c2->levels && + c1->alpha == c2->alpha && + c1->dovi == c2->dovi && + pl_bit_encoding_equal(&c1->bits, &c2->bits); +} + +static struct pl_bit_encoding pl_bit_encoding_merge(const struct pl_bit_encoding *orig, + const struct pl_bit_encoding *new) +{ + return (struct pl_bit_encoding) { + .sample_depth = PL_DEF(orig->sample_depth, new->sample_depth), + .color_depth = PL_DEF(orig->color_depth, new->color_depth), + .bit_shift = PL_DEF(orig->bit_shift, new->bit_shift), + }; +} + +void pl_color_repr_merge(struct pl_color_repr *orig, const struct pl_color_repr *new) +{ + *orig = (struct pl_color_repr) { + .sys = PL_DEF(orig->sys, new->sys), + .levels = PL_DEF(orig->levels, new->levels), + .alpha = PL_DEF(orig->alpha, new->alpha), + .dovi = PL_DEF(orig->dovi, new->dovi), + .bits = pl_bit_encoding_merge(&orig->bits, &new->bits), + }; +} + +enum pl_color_levels pl_color_levels_guess(const struct pl_color_repr *repr) +{ + if (repr->sys == 
PL_COLOR_SYSTEM_DOLBYVISION) + return PL_COLOR_LEVELS_FULL; + + if (repr->levels) + return repr->levels; + + return pl_color_system_is_ycbcr_like(repr->sys) + ? PL_COLOR_LEVELS_LIMITED + : PL_COLOR_LEVELS_FULL; +} + +float pl_color_repr_normalize(struct pl_color_repr *repr) +{ + float scale = 1.0; + struct pl_bit_encoding *bits = &repr->bits; + + if (bits->bit_shift) { + scale /= (1LL << bits->bit_shift); + bits->bit_shift = 0; + } + + // If one of these is set but not the other, use the set one + int tex_bits = PL_DEF(bits->sample_depth, 8); + int col_bits = PL_DEF(bits->color_depth, tex_bits); + tex_bits = PL_DEF(tex_bits, col_bits); + + if (pl_color_levels_guess(repr) == PL_COLOR_LEVELS_LIMITED) { + // Limit range is always shifted directly + scale *= (float) (1LL << tex_bits) / (1LL << col_bits); + } else { + // Full range always uses the full range available + scale *= ((1LL << tex_bits) - 1.) / ((1LL << col_bits) - 1.); + } + + bits->color_depth = bits->sample_depth; + return scale; +} + +bool pl_color_primaries_is_wide_gamut(enum pl_color_primaries prim) +{ + switch (prim) { + case PL_COLOR_PRIM_UNKNOWN: + case PL_COLOR_PRIM_BT_601_525: + case PL_COLOR_PRIM_BT_601_625: + case PL_COLOR_PRIM_BT_709: + case PL_COLOR_PRIM_BT_470M: + case PL_COLOR_PRIM_EBU_3213: + return false; + case PL_COLOR_PRIM_BT_2020: + case PL_COLOR_PRIM_APPLE: + case PL_COLOR_PRIM_ADOBE: + case PL_COLOR_PRIM_PRO_PHOTO: + case PL_COLOR_PRIM_CIE_1931: + case PL_COLOR_PRIM_DCI_P3: + case PL_COLOR_PRIM_DISPLAY_P3: + case PL_COLOR_PRIM_V_GAMUT: + case PL_COLOR_PRIM_S_GAMUT: + case PL_COLOR_PRIM_FILM_C: + case PL_COLOR_PRIM_ACES_AP0: + case PL_COLOR_PRIM_ACES_AP1: + return true; + case PL_COLOR_PRIM_COUNT: break; + } + + pl_unreachable(); +} + +enum pl_color_primaries pl_color_primaries_guess(int width, int height) +{ + // HD content + if (width >= 1280 || height > 576) + return PL_COLOR_PRIM_BT_709; + + switch (height) { + case 576: // Typical PAL content, including anamorphic/squared + return PL_COLOR_PRIM_BT_601_625; + + case 480: // Typical NTSC content, including squared + case 486: // NTSC Pro or anamorphic NTSC + return PL_COLOR_PRIM_BT_601_525; + + default: // No good metric, just pick BT.709 to minimize damage + return PL_COLOR_PRIM_BT_709; + } +} + +// HLG 75% value (scene-referred) +#define HLG_75 3.17955 + +float pl_color_transfer_nominal_peak(enum pl_color_transfer trc) +{ + switch (trc) { + case PL_COLOR_TRC_UNKNOWN: + case PL_COLOR_TRC_BT_1886: + case PL_COLOR_TRC_SRGB: + case PL_COLOR_TRC_LINEAR: + case PL_COLOR_TRC_GAMMA18: + case PL_COLOR_TRC_GAMMA20: + case PL_COLOR_TRC_GAMMA22: + case PL_COLOR_TRC_GAMMA24: + case PL_COLOR_TRC_GAMMA26: + case PL_COLOR_TRC_GAMMA28: + case PL_COLOR_TRC_PRO_PHOTO: + case PL_COLOR_TRC_ST428: + return 1.0; + case PL_COLOR_TRC_PQ: return 10000.0 / PL_COLOR_SDR_WHITE; + case PL_COLOR_TRC_HLG: return 12.0 / HLG_75; + case PL_COLOR_TRC_V_LOG: return 46.0855; + case PL_COLOR_TRC_S_LOG1: return 6.52; + case PL_COLOR_TRC_S_LOG2: return 9.212; + case PL_COLOR_TRC_COUNT: break; + } + + pl_unreachable(); +} + +const struct pl_hdr_metadata pl_hdr_metadata_empty = {0}; +const struct pl_hdr_metadata pl_hdr_metadata_hdr10 ={ + .prim = { + .red = {0.708, 0.292}, + .green = {0.170, 0.797}, + .blue = {0.131, 0.046}, + .white = {0.31271, 0.32902}, + }, + .min_luma = 0, + .max_luma = 10000, + .max_cll = 10000, + .max_fall = 0, // unknown +}; + +static const float PQ_M1 = 2610./4096 * 1./4, + PQ_M2 = 2523./4096 * 128, + PQ_C1 = 3424./4096, + PQ_C2 = 2413./4096 * 32, + PQ_C3 = 2392./4096 * 
32; + +float pl_hdr_rescale(enum pl_hdr_scaling from, enum pl_hdr_scaling to, float x) +{ + if (from == to) + return x; + if (!x) // micro-optimization for common value + return x; + + x = fmaxf(x, 0.0f); + + // Convert input to PL_SCALE_RELATIVE + switch (from) { + case PL_HDR_PQ: + x = powf(x, 1.0f / PQ_M2); + x = fmaxf(x - PQ_C1, 0.0f) / (PQ_C2 - PQ_C3 * x); + x = powf(x, 1.0f / PQ_M1); + x *= 10000.0f; + // fall through + case PL_HDR_NITS: + x /= PL_COLOR_SDR_WHITE; + // fall through + case PL_HDR_NORM: + goto output; + case PL_HDR_SQRT: + x *= x; + goto output; + case PL_HDR_SCALING_COUNT: + break; + } + + pl_unreachable(); + +output: + // Convert PL_SCALE_RELATIVE to output + switch (to) { + case PL_HDR_NORM: + return x; + case PL_HDR_SQRT: + return sqrtf(x); + case PL_HDR_NITS: + return x * PL_COLOR_SDR_WHITE; + case PL_HDR_PQ: + x *= PL_COLOR_SDR_WHITE / 10000.0f; + x = powf(x, PQ_M1); + x = (PQ_C1 + PQ_C2 * x) / (1.0f + PQ_C3 * x); + x = powf(x, PQ_M2); + return x; + case PL_HDR_SCALING_COUNT: + break; + } + + pl_unreachable(); +} + +static inline bool pl_hdr_bezier_equal(const struct pl_hdr_bezier *a, + const struct pl_hdr_bezier *b) +{ + return a->target_luma == b->target_luma && + a->knee_x == b->knee_x && + a->knee_y == b->knee_y && + a->num_anchors == b->num_anchors && + !memcmp(a->anchors, b->anchors, sizeof(a->anchors[0]) * a->num_anchors); +} + +bool pl_hdr_metadata_equal(const struct pl_hdr_metadata *a, + const struct pl_hdr_metadata *b) +{ + return pl_raw_primaries_equal(&a->prim, &b->prim) && + a->min_luma == b->min_luma && + a->max_luma == b->max_luma && + a->max_cll == b->max_cll && + a->max_fall == b->max_fall && + a->scene_max[0] == b->scene_max[0] && + a->scene_max[1] == b->scene_max[1] && + a->scene_max[2] == b->scene_max[2] && + a->scene_avg == b->scene_avg && + pl_hdr_bezier_equal(&a->ootf, &b->ootf) && + a->max_pq_y == b->max_pq_y && + a->avg_pq_y == b->avg_pq_y; +} + +void pl_hdr_metadata_merge(struct pl_hdr_metadata *orig, + const struct pl_hdr_metadata *update) +{ + pl_raw_primaries_merge(&orig->prim, &update->prim); + if (!orig->min_luma) + orig->min_luma = update->min_luma; + if (!orig->max_luma) + orig->max_luma = update->max_luma; + if (!orig->max_cll) + orig->max_cll = update->max_cll; + if (!orig->max_fall) + orig->max_fall = update->max_fall; + if (!orig->scene_max[1]) + memcpy(orig->scene_max, update->scene_max, sizeof(orig->scene_max)); + if (!orig->scene_avg) + orig->scene_avg = update->scene_avg; + if (!orig->ootf.target_luma) + orig->ootf = update->ootf; + if (!orig->max_pq_y) + orig->max_pq_y = update->max_pq_y; + if (!orig->avg_pq_y) + orig->avg_pq_y = update->avg_pq_y; +} + +bool pl_hdr_metadata_contains(const struct pl_hdr_metadata *data, + enum pl_hdr_metadata_type type) +{ + bool has_hdr10 = data->max_luma; + bool has_hdr10plus = data->scene_avg && (data->scene_max[0] || + data->scene_max[1] || + data->scene_max[2]); + bool has_cie_y = data->max_pq_y && data->avg_pq_y; + + switch (type) { + case PL_HDR_METADATA_NONE: return true; + case PL_HDR_METADATA_ANY: return has_hdr10 || has_hdr10plus || has_cie_y; + case PL_HDR_METADATA_HDR10: return has_hdr10; + case PL_HDR_METADATA_HDR10PLUS: return has_hdr10plus; + case PL_HDR_METADATA_CIE_Y: return has_cie_y; + case PL_HDR_METADATA_TYPE_COUNT: break; + } + + pl_unreachable(); +} + +const struct pl_color_space pl_color_space_unknown = {0}; + +const struct pl_color_space pl_color_space_srgb = { + .primaries = PL_COLOR_PRIM_BT_709, + .transfer = PL_COLOR_TRC_SRGB, +}; + +const struct pl_color_space 
pl_color_space_bt709 = { + .primaries = PL_COLOR_PRIM_BT_709, + .transfer = PL_COLOR_TRC_BT_1886, +}; + +const struct pl_color_space pl_color_space_hdr10 = { + .primaries = PL_COLOR_PRIM_BT_2020, + .transfer = PL_COLOR_TRC_PQ, +}; + +const struct pl_color_space pl_color_space_bt2020_hlg = { + .primaries = PL_COLOR_PRIM_BT_2020, + .transfer = PL_COLOR_TRC_HLG, +}; + +const struct pl_color_space pl_color_space_monitor = { + .primaries = PL_COLOR_PRIM_BT_709, // sRGB primaries + .transfer = PL_COLOR_TRC_UNKNOWN, // unknown SDR response +}; + +bool pl_color_space_is_hdr(const struct pl_color_space *csp) +{ + return csp->hdr.max_luma > PL_COLOR_SDR_WHITE || + pl_color_transfer_is_hdr(csp->transfer); +} + +bool pl_color_space_is_black_scaled(const struct pl_color_space *csp) +{ + switch (csp->transfer) { + case PL_COLOR_TRC_UNKNOWN: + case PL_COLOR_TRC_SRGB: + case PL_COLOR_TRC_LINEAR: + case PL_COLOR_TRC_GAMMA18: + case PL_COLOR_TRC_GAMMA20: + case PL_COLOR_TRC_GAMMA22: + case PL_COLOR_TRC_GAMMA24: + case PL_COLOR_TRC_GAMMA26: + case PL_COLOR_TRC_GAMMA28: + case PL_COLOR_TRC_PRO_PHOTO: + case PL_COLOR_TRC_ST428: + case PL_COLOR_TRC_HLG: + return true; + + case PL_COLOR_TRC_BT_1886: + case PL_COLOR_TRC_PQ: + case PL_COLOR_TRC_V_LOG: + case PL_COLOR_TRC_S_LOG1: + case PL_COLOR_TRC_S_LOG2: + return false; + + case PL_COLOR_TRC_COUNT: break; + } + + pl_unreachable(); +} + +void pl_color_space_merge(struct pl_color_space *orig, + const struct pl_color_space *new) +{ + if (!orig->primaries) + orig->primaries = new->primaries; + if (!orig->transfer) + orig->transfer = new->transfer; + pl_hdr_metadata_merge(&orig->hdr, &new->hdr); +} + +bool pl_color_space_equal(const struct pl_color_space *c1, + const struct pl_color_space *c2) +{ + return c1->primaries == c2->primaries && + c1->transfer == c2->transfer && + pl_hdr_metadata_equal(&c1->hdr, &c2->hdr); +} + +// Estimates luminance from maxRGB by looking at how monochromatic MaxSCL is +static void luma_from_maxrgb(const struct pl_color_space *csp, + enum pl_hdr_scaling scaling, + float *out_max, float *out_avg) +{ + const float maxscl = PL_MAX3(csp->hdr.scene_max[0], + csp->hdr.scene_max[1], + csp->hdr.scene_max[2]); + if (!maxscl) + return; + + struct pl_raw_primaries prim = csp->hdr.prim; + pl_raw_primaries_merge(&prim, pl_raw_primaries_get(csp->primaries)); + const pl_matrix3x3 rgb2xyz = pl_get_rgb2xyz_matrix(&prim); + + const float max_luma = rgb2xyz.m[1][0] * csp->hdr.scene_max[0] + + rgb2xyz.m[1][1] * csp->hdr.scene_max[1] + + rgb2xyz.m[1][2] * csp->hdr.scene_max[2]; + + const float coef = max_luma / maxscl; + *out_max = pl_hdr_rescale(PL_HDR_NITS, scaling, max_luma); + *out_avg = pl_hdr_rescale(PL_HDR_NITS, scaling, coef * csp->hdr.scene_avg); +} + +static inline bool metadata_compat(enum pl_hdr_metadata_type metadata, + enum pl_hdr_metadata_type compat) +{ + return metadata == PL_HDR_METADATA_ANY || metadata == compat; +} + +void pl_color_space_nominal_luma_ex(const struct pl_nominal_luma_params *params) +{ + if (!params || (!params->out_min && !params->out_max && !params->out_avg)) + return; + + const struct pl_color_space *csp = params->color; + const enum pl_hdr_scaling scaling = params->scaling; + + float min_luma = 0, max_luma = 0, avg_luma = 0; + if (params->metadata != PL_HDR_METADATA_NONE) { + // Initialize from static HDR10 metadata, in all cases + min_luma = pl_hdr_rescale(PL_HDR_NITS, scaling, csp->hdr.min_luma); + max_luma = pl_hdr_rescale(PL_HDR_NITS, scaling, csp->hdr.max_luma); + } + + if (metadata_compat(params->metadata, 
PL_HDR_METADATA_HDR10PLUS) && + pl_hdr_metadata_contains(&csp->hdr, PL_HDR_METADATA_HDR10PLUS)) + { + luma_from_maxrgb(csp, scaling, &max_luma, &avg_luma); + } + + if (metadata_compat(params->metadata, PL_HDR_METADATA_CIE_Y) && + pl_hdr_metadata_contains(&csp->hdr, PL_HDR_METADATA_CIE_Y)) + { + max_luma = pl_hdr_rescale(PL_HDR_PQ, scaling, csp->hdr.max_pq_y); + avg_luma = pl_hdr_rescale(PL_HDR_PQ, scaling, csp->hdr.avg_pq_y); + } + + // Clamp to sane value range + const float hdr_min = pl_hdr_rescale(PL_HDR_NITS, scaling, PL_COLOR_HDR_BLACK); + const float hdr_max = pl_hdr_rescale(PL_HDR_PQ, scaling, 1.0f); + max_luma = max_luma ? PL_CLAMP(max_luma, hdr_min, hdr_max) : 0; + min_luma = min_luma ? PL_CLAMP(min_luma, hdr_min, hdr_max) : 0; + if ((max_luma && min_luma >= max_luma) || min_luma >= hdr_max) + min_luma = max_luma = 0; // sanity + + // PQ is always scaled down to absolute black, ignoring HDR metadata + if (csp->transfer == PL_COLOR_TRC_PQ) + min_luma = hdr_min; + + // Baseline/fallback metadata, inferred entirely from the colorspace + // description and built-in default assumptions + if (!max_luma) { + if (csp->transfer == PL_COLOR_TRC_HLG) { + max_luma = pl_hdr_rescale(PL_HDR_NITS, scaling, PL_COLOR_HLG_PEAK); + } else { + const float peak = pl_color_transfer_nominal_peak(csp->transfer); + max_luma = pl_hdr_rescale(PL_HDR_NORM, scaling, peak); + } + } + + if (!min_luma) { + if (pl_color_transfer_is_hdr(csp->transfer)) { + min_luma = hdr_min; + } else { + const float peak = pl_hdr_rescale(scaling, PL_HDR_NITS, max_luma); + min_luma = pl_hdr_rescale(PL_HDR_NITS, scaling, + peak / PL_COLOR_SDR_CONTRAST); + } + } + + if (avg_luma) + avg_luma = PL_CLAMP(avg_luma, min_luma, max_luma); // sanity + + if (params->out_min) + *params->out_min = min_luma; + if (params->out_max) + *params->out_max = max_luma; + if (params->out_avg) + *params->out_avg = avg_luma; +} + +void pl_color_space_nominal_luma(const struct pl_color_space *csp, + float *out_min, float *out_max) +{ + pl_color_space_nominal_luma_ex(pl_nominal_luma_params( + .color = csp, + .metadata = PL_HDR_METADATA_ANY, + .scaling = PL_HDR_NORM, + .out_min = out_min, + .out_max = out_max, + )); +} + +void pl_color_space_infer(struct pl_color_space *space) +{ + if (!space->primaries) + space->primaries = PL_COLOR_PRIM_BT_709; + if (!space->transfer) + space->transfer = PL_COLOR_TRC_BT_1886; + + // Sanitize the static HDR metadata + pl_color_space_nominal_luma_ex(pl_nominal_luma_params( + .color = space, + .metadata = PL_HDR_METADATA_HDR10, + .scaling = PL_HDR_NITS, + .out_max = &space->hdr.max_luma, + // Preserve tagged minimum + .out_min = space->hdr.min_luma ? 
NULL : &space->hdr.min_luma, + )); + + // Default the signal color space based on the nominal raw primaries + if (!pl_primaries_valid(&space->hdr.prim)) + space->hdr.prim = *pl_raw_primaries_get(space->primaries); +} + +static void infer_both_ref(struct pl_color_space *space, + struct pl_color_space *ref) +{ + pl_color_space_infer(ref); + + if (!space->primaries) { + if (pl_color_primaries_is_wide_gamut(ref->primaries)) { + space->primaries = PL_COLOR_PRIM_BT_709; + } else { + space->primaries = ref->primaries; + } + } + + if (!space->transfer) { + switch (ref->transfer) { + case PL_COLOR_TRC_UNKNOWN: + case PL_COLOR_TRC_COUNT: + pl_unreachable(); + case PL_COLOR_TRC_BT_1886: + case PL_COLOR_TRC_SRGB: + case PL_COLOR_TRC_GAMMA22: + // Re-use input transfer curve to avoid small adaptations + space->transfer = ref->transfer; + break; + case PL_COLOR_TRC_PQ: + case PL_COLOR_TRC_HLG: + case PL_COLOR_TRC_V_LOG: + case PL_COLOR_TRC_S_LOG1: + case PL_COLOR_TRC_S_LOG2: + // Pick BT.1886 model because it models SDR contrast accurately, + // and we need contrast information for tone mapping + space->transfer = PL_COLOR_TRC_BT_1886; + break; + case PL_COLOR_TRC_PRO_PHOTO: + // ProPhotoRGB and sRGB are both piecewise with linear slope + space->transfer = PL_COLOR_TRC_SRGB; + break; + case PL_COLOR_TRC_LINEAR: + case PL_COLOR_TRC_GAMMA18: + case PL_COLOR_TRC_GAMMA20: + case PL_COLOR_TRC_GAMMA24: + case PL_COLOR_TRC_GAMMA26: + case PL_COLOR_TRC_GAMMA28: + case PL_COLOR_TRC_ST428: + // Pick pure power output curve to avoid introducing black crush + space->transfer = PL_COLOR_TRC_GAMMA22; + break; + } + } + + // Infer the remaining fields after making the above choices + pl_color_space_infer(space); +} + +void pl_color_space_infer_ref(struct pl_color_space *space, + const struct pl_color_space *refp) +{ + // Make a copy of `refp` to infer missing values first + struct pl_color_space ref = *refp; + infer_both_ref(space, &ref); +} + +void pl_color_space_infer_map(struct pl_color_space *src, + struct pl_color_space *dst) +{ + bool unknown_src_contrast = !src->hdr.min_luma; + bool unknown_dst_contrast = !dst->hdr.min_luma; + + infer_both_ref(dst, src); + + // If the src has an unspecified gamma curve with dynamic black scaling, + // default it to match the dst colorspace contrast. This does not matter in + // most cases, but ensures that BT.1886 is tuned to the appropriate black + // point by default. 
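As a quick usage sketch of pl_color_space_infer_map as a whole (the wrapper name and the sample values are hypothetical, illustrative only):

#include <libplacebo/colorspace.h>

static void infer_map_sketch(void)
{
    struct pl_color_space src = {
        .primaries    = PL_COLOR_PRIM_BT_2020,
        .transfer     = PL_COLOR_TRC_PQ,
        .hdr.max_luma = 1000, // mastering peak, in nits
    };
    struct pl_color_space dst = {0}; // nothing known about the display

    pl_color_space_infer_map(&src, &dst);
    // dst now reports BT.709 primaries with a BT.1886 transfer, and both
    // sides end up with plausible min/max luminance for tone mapping
}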
+ bool dynamic_src_contrast = pl_color_space_is_black_scaled(src) || + src->transfer == PL_COLOR_TRC_BT_1886; + if (unknown_src_contrast && dynamic_src_contrast) + src->hdr.min_luma = dst->hdr.min_luma; + + // Do the same in reverse if both src and dst are SDR curves + bool src_is_sdr = !pl_color_space_is_hdr(src); + bool dst_is_sdr = !pl_color_space_is_hdr(dst); + if (unknown_dst_contrast && src_is_sdr && dst_is_sdr) + dst->hdr.min_luma = src->hdr.min_luma; + + // If the src is HLG and the output is HDR, tune the HLG peak to the output + if (src->transfer == PL_COLOR_TRC_HLG && pl_color_space_is_hdr(dst)) + src->hdr.max_luma = dst->hdr.max_luma; +} + +const struct pl_color_adjustment pl_color_adjustment_neutral = { + PL_COLOR_ADJUSTMENT_NEUTRAL +}; + +void pl_chroma_location_offset(enum pl_chroma_location loc, float *x, float *y) +{ + *x = *y = 0; + + // This is the majority of subsampled chroma content out there + loc = PL_DEF(loc, PL_CHROMA_LEFT); + + switch (loc) { + case PL_CHROMA_LEFT: + case PL_CHROMA_TOP_LEFT: + case PL_CHROMA_BOTTOM_LEFT: + *x = -0.5; + break; + default: break; + } + + switch (loc) { + case PL_CHROMA_TOP_LEFT: + case PL_CHROMA_TOP_CENTER: + *y = -0.5; + break; + default: break; + } + + switch (loc) { + case PL_CHROMA_BOTTOM_LEFT: + case PL_CHROMA_BOTTOM_CENTER: + *y = 0.5; + break; + default: break; + } +} + +struct pl_cie_xy pl_white_from_temp(float temp) +{ + temp = PL_CLAMP(temp, 2500, 25000); + + double ti = 1000.0 / temp, ti2 = ti * ti, ti3 = ti2 * ti, x; + if (temp <= 7000) { + x = -4.6070 * ti3 + 2.9678 * ti2 + 0.09911 * ti + 0.244063; + } else { + x = -2.0064 * ti3 + 1.9018 * ti2 + 0.24748 * ti + 0.237040; + } + + return (struct pl_cie_xy) { + .x = x, + .y = -3 * (x*x) + 2.87 * x - 0.275, + }; +} + +bool pl_raw_primaries_equal(const struct pl_raw_primaries *a, + const struct pl_raw_primaries *b) +{ + return pl_cie_xy_equal(&a->red, &b->red) && + pl_cie_xy_equal(&a->green, &b->green) && + pl_cie_xy_equal(&a->blue, &b->blue) && + pl_cie_xy_equal(&a->white, &b->white); +} + +bool pl_raw_primaries_similar(const struct pl_raw_primaries *a, + const struct pl_raw_primaries *b) +{ + float delta = fabsf(a->red.x - b->red.x) + + fabsf(a->red.y - b->red.y) + + fabsf(a->green.x - b->green.x) + + fabsf(a->green.y - b->green.y) + + fabsf(a->blue.x - b->blue.x) + + fabsf(a->blue.y - b->blue.y) + + fabsf(a->white.x - b->white.x) + + fabsf(a->white.y - b->white.y); + + return delta < 0.001; +} + +void pl_raw_primaries_merge(struct pl_raw_primaries *orig, + const struct pl_raw_primaries *update) +{ + union { + struct pl_raw_primaries prim; + float raw[8]; + } *pa = (void *) orig, + *pb = (void *) update; + + pl_static_assert(sizeof(*pa) == sizeof(*orig)); + for (int i = 0; i < PL_ARRAY_SIZE(pa->raw); i++) + pa->raw[i] = PL_DEF(pa->raw[i], pb->raw[i]); +} + +const struct pl_raw_primaries *pl_raw_primaries_get(enum pl_color_primaries prim) +{ + /* + Values from: ITU-R Recommendations BT.470-6, BT.601-7, BT.709-5, BT.2020-0 + + https://www.itu.int/dms_pubrec/itu-r/rec/bt/R-REC-BT.470-6-199811-S!!PDF-E.pdf + https://www.itu.int/dms_pubrec/itu-r/rec/bt/R-REC-BT.601-7-201103-I!!PDF-E.pdf + https://www.itu.int/dms_pubrec/itu-r/rec/bt/R-REC-BT.709-5-200204-I!!PDF-E.pdf + https://www.itu.int/dms_pubrec/itu-r/rec/bt/R-REC-BT.2020-0-201208-I!!PDF-E.pdf + + Other colorspaces from https://en.wikipedia.org/wiki/RGB_color_space#Specifications + */ + + // CIE standard illuminant series +#define CIE_D50 {0.3457, 0.3585} +#define CIE_D65 {0.3127, 0.3290} +#define CIE_C {0.3100, 0.3160} 
+#define CIE_E {1.0/3.0, 1.0/3.0} +#define DCI {0.3140, 0.3510} + + static const struct pl_raw_primaries primaries[] = { + [PL_COLOR_PRIM_BT_470M] = { + .red = {0.670, 0.330}, + .green = {0.210, 0.710}, + .blue = {0.140, 0.080}, + .white = CIE_C, + }, + + [PL_COLOR_PRIM_BT_601_525] = { + .red = {0.630, 0.340}, + .green = {0.310, 0.595}, + .blue = {0.155, 0.070}, + .white = CIE_D65, + }, + [PL_COLOR_PRIM_BT_601_625] = { + .red = {0.640, 0.330}, + .green = {0.290, 0.600}, + .blue = {0.150, 0.060}, + .white = CIE_D65, + }, + [PL_COLOR_PRIM_BT_709] = { + .red = {0.640, 0.330}, + .green = {0.300, 0.600}, + .blue = {0.150, 0.060}, + .white = CIE_D65, + }, + [PL_COLOR_PRIM_BT_2020] = { + .red = {0.708, 0.292}, + .green = {0.170, 0.797}, + .blue = {0.131, 0.046}, + .white = CIE_D65, + }, + [PL_COLOR_PRIM_APPLE] = { + .red = {0.625, 0.340}, + .green = {0.280, 0.595}, + .blue = {0.115, 0.070}, + .white = CIE_D65, + }, + [PL_COLOR_PRIM_ADOBE] = { + .red = {0.640, 0.330}, + .green = {0.210, 0.710}, + .blue = {0.150, 0.060}, + .white = CIE_D65, + }, + [PL_COLOR_PRIM_PRO_PHOTO] = { + .red = {0.7347, 0.2653}, + .green = {0.1596, 0.8404}, + .blue = {0.0366, 0.0001}, + .white = CIE_D50, + }, + [PL_COLOR_PRIM_CIE_1931] = { + .red = {0.7347, 0.2653}, + .green = {0.2738, 0.7174}, + .blue = {0.1666, 0.0089}, + .white = CIE_E, + }, + // From SMPTE RP 431-2 + [PL_COLOR_PRIM_DCI_P3] = { + .red = {0.680, 0.320}, + .green = {0.265, 0.690}, + .blue = {0.150, 0.060}, + .white = DCI, + }, + [PL_COLOR_PRIM_DISPLAY_P3] = { + .red = {0.680, 0.320}, + .green = {0.265, 0.690}, + .blue = {0.150, 0.060}, + .white = CIE_D65, + }, + // From Panasonic VARICAM reference manual + [PL_COLOR_PRIM_V_GAMUT] = { + .red = {0.730, 0.280}, + .green = {0.165, 0.840}, + .blue = {0.100, -0.03}, + .white = CIE_D65, + }, + // From Sony S-Log reference manual + [PL_COLOR_PRIM_S_GAMUT] = { + .red = {0.730, 0.280}, + .green = {0.140, 0.855}, + .blue = {0.100, -0.05}, + .white = CIE_D65, + }, + // From FFmpeg source code + [PL_COLOR_PRIM_FILM_C] = { + .red = {0.681, 0.319}, + .green = {0.243, 0.692}, + .blue = {0.145, 0.049}, + .white = CIE_C, + }, + [PL_COLOR_PRIM_EBU_3213] = { + .red = {0.630, 0.340}, + .green = {0.295, 0.605}, + .blue = {0.155, 0.077}, + .white = CIE_D65, + }, + // From Wikipedia + [PL_COLOR_PRIM_ACES_AP0] = { + .red = {0.7347, 0.2653}, + .green = {0.0000, 1.0000}, + .blue = {0.0001, -0.0770}, + .white = {0.32168, 0.33767}, + }, + [PL_COLOR_PRIM_ACES_AP1] = { + .red = {0.713, 0.293}, + .green = {0.165, 0.830}, + .blue = {0.128, 0.044}, + .white = {0.32168, 0.33767}, + }, + }; + + // This is the default assumption if no colorspace information could + // be determined, eg. for files which have no video channel. 
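As an illustrative sketch of how this table is consumed (the helper name is hypothetical; pl_primaries_superset is defined further down in this file):

#include <assert.h>
#include <libplacebo/colorspace.h>

static void gamut_containment_sketch(void)
{
    const struct pl_raw_primaries *bt709  = pl_raw_primaries_get(PL_COLOR_PRIM_BT_709);
    const struct pl_raw_primaries *bt2020 = pl_raw_primaries_get(PL_COLOR_PRIM_BT_2020);

    // BT.2020 fully contains BT.709, but not the other way around
    assert(pl_primaries_superset(bt2020, bt709));
    assert(!pl_primaries_superset(bt709, bt2020));
}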
+ if (!prim) + prim = PL_COLOR_PRIM_BT_709; + + pl_assert(prim < PL_ARRAY_SIZE(primaries)); + return &primaries[prim]; +} + +// Compute the RGB/XYZ matrix as described here: +// http://www.brucelindbloom.com/index.html?Eqn_RGB_XYZ_Matrix.html +pl_matrix3x3 pl_get_rgb2xyz_matrix(const struct pl_raw_primaries *prim) +{ + pl_matrix3x3 out = {{{0}}}; + float S[3], X[4], Z[4]; + + X[0] = pl_cie_X(prim->red); + X[1] = pl_cie_X(prim->green); + X[2] = pl_cie_X(prim->blue); + X[3] = pl_cie_X(prim->white); + + Z[0] = pl_cie_Z(prim->red); + Z[1] = pl_cie_Z(prim->green); + Z[2] = pl_cie_Z(prim->blue); + Z[3] = pl_cie_Z(prim->white); + + // S = XYZ^-1 * W + for (int i = 0; i < 3; i++) { + out.m[0][i] = X[i]; + out.m[1][i] = 1; + out.m[2][i] = Z[i]; + } + + pl_matrix3x3_invert(&out); + + for (int i = 0; i < 3; i++) + S[i] = out.m[i][0] * X[3] + out.m[i][1] * 1 + out.m[i][2] * Z[3]; + + // M = [Sc * XYZc] + for (int i = 0; i < 3; i++) { + out.m[0][i] = S[i] * X[i]; + out.m[1][i] = S[i] * 1; + out.m[2][i] = S[i] * Z[i]; + } + + return out; +} + +pl_matrix3x3 pl_get_xyz2rgb_matrix(const struct pl_raw_primaries *prim) +{ + // For simplicity, just invert the rgb2xyz matrix + pl_matrix3x3 out = pl_get_rgb2xyz_matrix(prim); + pl_matrix3x3_invert(&out); + return out; +} + +// LMS<-XYZ revised matrix from CIECAM97, based on a linear transform and +// normalized for equal energy on monochrome inputs +static const pl_matrix3x3 m_cat97 = {{ + { 0.8562, 0.3372, -0.1934 }, + { -0.8360, 1.8327, 0.0033 }, + { 0.0357, -0.0469, 1.0112 }, +}}; + +// M := M * XYZd<-XYZs +static void apply_chromatic_adaptation(struct pl_cie_xy src, + struct pl_cie_xy dest, + pl_matrix3x3 *mat) +{ + // If the white points are nearly identical, this is a wasteful identity + // operation. + if (fabs(src.x - dest.x) < 1e-6 && fabs(src.y - dest.y) < 1e-6) + return; + + // XYZd<-XYZs = Ma^-1 * (I*[Cd/Cs]) * Ma + // http://www.brucelindbloom.com/index.html?Eqn_ChromAdapt.html + // For Ma, we use the CIECAM97 revised (linear) matrix + float C[3][2]; + + for (int i = 0; i < 3; i++) { + // source cone + C[i][0] = m_cat97.m[i][0] * pl_cie_X(src) + + m_cat97.m[i][1] * 1 + + m_cat97.m[i][2] * pl_cie_Z(src); + + // dest cone + C[i][1] = m_cat97.m[i][0] * pl_cie_X(dest) + + m_cat97.m[i][1] * 1 + + m_cat97.m[i][2] * pl_cie_Z(dest); + } + + // tmp := I * [Cd/Cs] * Ma + pl_matrix3x3 tmp = {0}; + for (int i = 0; i < 3; i++) + tmp.m[i][i] = C[i][1] / C[i][0]; + + pl_matrix3x3_mul(&tmp, &m_cat97); + + // M := M * Ma^-1 * tmp + pl_matrix3x3 ma_inv = m_cat97; + pl_matrix3x3_invert(&ma_inv); + pl_matrix3x3_mul(mat, &ma_inv); + pl_matrix3x3_mul(mat, &tmp); +} + +pl_matrix3x3 pl_get_adaptation_matrix(struct pl_cie_xy src, struct pl_cie_xy dst) +{ + // Use BT.709 primaries (with chosen white point) as an XYZ reference + struct pl_raw_primaries csp = *pl_raw_primaries_get(PL_COLOR_PRIM_BT_709); + csp.white = src; + + pl_matrix3x3 rgb2xyz = pl_get_rgb2xyz_matrix(&csp); + pl_matrix3x3 xyz2rgb = rgb2xyz; + pl_matrix3x3_invert(&xyz2rgb); + + apply_chromatic_adaptation(src, dst, &xyz2rgb); + pl_matrix3x3_mul(&xyz2rgb, &rgb2xyz); + return xyz2rgb; +} + +pl_matrix3x3 pl_ipt_rgb2lms(const struct pl_raw_primaries *prim) +{ + static const pl_matrix3x3 hpe = {{ // HPE XYZ->LMS (D65) method + { 0.40024f, 0.70760f, -0.08081f }, + { -0.22630f, 1.16532f, 0.04570f }, + { 0.00000f, 0.00000f, 0.91822f }, + }}; + + const float c = 0.04; // 4% crosstalk + pl_matrix3x3 m = {{ + { 1 - 2*c, c, c }, + { c, 1 - 2*c, c }, + { c, c, 1 - 2*c }, + }}; + + pl_matrix3x3_mul(&m, &hpe); + + // Apply 
chromatic adaptation to D65 if the input white point differs + static const struct pl_cie_xy d65 = CIE_D65; + apply_chromatic_adaptation(prim->white, d65, &m); + + const pl_matrix3x3 rgb2xyz = pl_get_rgb2xyz_matrix(prim); + pl_matrix3x3_mul(&m, &rgb2xyz); + return m; +} + +pl_matrix3x3 pl_ipt_lms2rgb(const struct pl_raw_primaries *prim) +{ + pl_matrix3x3 m = pl_ipt_rgb2lms(prim); + pl_matrix3x3_invert(&m); + return m; +} + +// As standardized in Ebner & Fairchild IPT (1998) +const pl_matrix3x3 pl_ipt_lms2ipt = {{ + { 0.4000, 0.4000, 0.2000 }, + { 4.4550, -4.8510, 0.3960 }, + { 0.8056, 0.3572, -1.1628 }, +}}; + +// Numerically inverted from the matrix above +const pl_matrix3x3 pl_ipt_ipt2lms = {{ + { 1.0, 0.0975689, 0.205226 }, + { 1.0, -0.1138760, 0.133217 }, + { 1.0, 0.0326151, -0.676887 }, +}}; + +const struct pl_cone_params pl_vision_normal = {PL_CONE_NONE, 1.0}; +const struct pl_cone_params pl_vision_protanomaly = {PL_CONE_L, 0.5}; +const struct pl_cone_params pl_vision_protanopia = {PL_CONE_L, 0.0}; +const struct pl_cone_params pl_vision_deuteranomaly = {PL_CONE_M, 0.5}; +const struct pl_cone_params pl_vision_deuteranopia = {PL_CONE_M, 0.0}; +const struct pl_cone_params pl_vision_tritanomaly = {PL_CONE_S, 0.5}; +const struct pl_cone_params pl_vision_tritanopia = {PL_CONE_S, 0.0}; +const struct pl_cone_params pl_vision_monochromacy = {PL_CONE_LM, 0.0}; +const struct pl_cone_params pl_vision_achromatopsia = {PL_CONE_LMS, 0.0}; + +pl_matrix3x3 pl_get_cone_matrix(const struct pl_cone_params *params, + const struct pl_raw_primaries *prim) +{ + // LMS<-RGB := LMS<-XYZ * XYZ<-RGB + pl_matrix3x3 rgb2lms = m_cat97; + pl_matrix3x3 rgb2xyz = pl_get_rgb2xyz_matrix(prim); + pl_matrix3x3_mul(&rgb2lms, &rgb2xyz); + + // LMS versions of the two opposing primaries, plus neutral + float lms_r[3] = {1.0, 0.0, 0.0}, + lms_b[3] = {0.0, 0.0, 1.0}, + lms_w[3] = {1.0, 1.0, 1.0}; + + pl_matrix3x3_apply(&rgb2lms, lms_r); + pl_matrix3x3_apply(&rgb2lms, lms_b); + pl_matrix3x3_apply(&rgb2lms, lms_w); + + float a, b, c = params->strength; + pl_matrix3x3 distort; + + switch (params->cones) { + case PL_CONE_NONE: + return pl_matrix3x3_identity; + + case PL_CONE_L: + // Solve to preserve neutral and blue + a = (lms_b[0] - lms_b[2] * lms_w[0] / lms_w[2]) / + (lms_b[1] - lms_b[2] * lms_w[1] / lms_w[2]); + b = (lms_b[0] - lms_b[1] * lms_w[0] / lms_w[1]) / + (lms_b[2] - lms_b[1] * lms_w[2] / lms_w[1]); + assert(fabs(a * lms_w[1] + b * lms_w[2] - lms_w[0]) < 1e-6); + + distort = (pl_matrix3x3) {{ + { c, (1.0 - c) * a, (1.0 - c) * b}, + { 0.0, 1.0, 0.0}, + { 0.0, 0.0, 1.0}, + }}; + break; + + case PL_CONE_M: + // Solve to preserve neutral and blue + a = (lms_b[1] - lms_b[2] * lms_w[1] / lms_w[2]) / + (lms_b[0] - lms_b[2] * lms_w[0] / lms_w[2]); + b = (lms_b[1] - lms_b[0] * lms_w[1] / lms_w[0]) / + (lms_b[2] - lms_b[0] * lms_w[2] / lms_w[0]); + assert(fabs(a * lms_w[0] + b * lms_w[2] - lms_w[1]) < 1e-6); + + distort = (pl_matrix3x3) {{ + { 1.0, 0.0, 0.0}, + {(1.0 - c) * a, c, (1.0 - c) * b}, + { 0.0, 0.0, 1.0}, + }}; + break; + + case PL_CONE_S: + // Solve to preserve neutral and red + a = (lms_r[2] - lms_r[1] * lms_w[2] / lms_w[1]) / + (lms_r[0] - lms_r[1] * lms_w[0] / lms_w[1]); + b = (lms_r[2] - lms_r[0] * lms_w[2] / lms_w[0]) / + (lms_r[1] - lms_r[0] * lms_w[1] / lms_w[0]); + assert(fabs(a * lms_w[0] + b * lms_w[1] - lms_w[2]) < 1e-6); + + distort = (pl_matrix3x3) {{ + { 1.0, 0.0, 0.0}, + { 0.0, 1.0, 0.0}, + {(1.0 - c) * a, (1.0 - c) * b, c}, + }}; + break; + + case PL_CONE_LM: + // Solve to preserve neutral + a = 
lms_w[0] / lms_w[2]; + b = lms_w[1] / lms_w[2]; + + distort = (pl_matrix3x3) {{ + { c, 0.0, (1.0 - c) * a}, + { 0.0, c, (1.0 - c) * b}, + { 0.0, 0.0, 1.0}, + }}; + break; + + case PL_CONE_MS: + // Solve to preserve neutral + a = lms_w[1] / lms_w[0]; + b = lms_w[2] / lms_w[0]; + + distort = (pl_matrix3x3) {{ + { 1.0, 0.0, 0.0}, + {(1.0 - c) * a, c, 0.0}, + {(1.0 - c) * b, 0.0, c}, + }}; + break; + + case PL_CONE_LS: + // Solve to preserve neutral + a = lms_w[0] / lms_w[1]; + b = lms_w[2] / lms_w[1]; + + distort = (pl_matrix3x3) {{ + { c, (1.0 - c) * a, 0.0}, + { 0.0, 1.0, 0.0}, + { 0.0, (1.0 - c) * b, c}, + }}; + break; + + case PL_CONE_LMS: { + // Rod cells only, which can be modelled somewhat as a combination of + // L and M cones. Either way, this is pushing the limits of the our + // color model, so this is only a rough approximation. + const float w[3] = {0.3605, 0.6415, -0.002}; + assert(fabs(w[0] + w[1] + w[2] - 1.0) < 1e-6); + + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 3; j++) { + distort.m[i][j] = (1.0 - c) * w[j] * lms_w[i] / lms_w[j]; + if (i == j) + distort.m[i][j] += c; + } + } + break; + } + + default: + pl_unreachable(); + } + + // out := RGB<-LMS * distort * LMS<-RGB + pl_matrix3x3 out = rgb2lms; + pl_matrix3x3_invert(&out); + pl_matrix3x3_mul(&out, &distort); + pl_matrix3x3_mul(&out, &rgb2lms); + + return out; +} + +pl_matrix3x3 pl_get_color_mapping_matrix(const struct pl_raw_primaries *src, + const struct pl_raw_primaries *dst, + enum pl_rendering_intent intent) +{ + // In saturation mapping, we don't care about accuracy and just want + // primaries to map to primaries, making this an identity transformation. + if (intent == PL_INTENT_SATURATION) + return pl_matrix3x3_identity; + + // RGBd<-RGBs = RGBd<-XYZd * XYZd<-XYZs * XYZs<-RGBs + // Equations from: http://www.brucelindbloom.com/index.html?Math.html + // Note: Perceptual is treated like relative colorimetric. There's no + // definition for perceptual other than "make it look good". 
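A caller-side sketch of the matrix this function produces (the helper name is hypothetical; this assumes the standard PL_INTENT_RELATIVE_COLORIMETRIC value from <libplacebo/colorspace.h>):

#include <libplacebo/colorspace.h>

static void gamut_map_sketch(void)
{
    const struct pl_raw_primaries *bt709 = pl_raw_primaries_get(PL_COLOR_PRIM_BT_709);
    const struct pl_raw_primaries *p3    = pl_raw_primaries_get(PL_COLOR_PRIM_DISPLAY_P3);

    pl_matrix3x3 m = pl_get_color_mapping_matrix(bt709, p3,
                                                 PL_INTENT_RELATIVE_COLORIMETRIC);

    float rgb[3] = { 1.0f, 0.0f, 0.0f }; // pure BT.709 red
    pl_matrix3x3_apply(&m, rgb);
    // rgb now holds the same color in Display-P3 coordinates, which is no
    // longer a pure primary since BT.709 red lies inside the P3 gamut
}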
+ + // RGBd<-XYZd matrix + pl_matrix3x3 xyz2rgb_d = pl_get_xyz2rgb_matrix(dst); + + // Chromatic adaptation, except in absolute colorimetric intent + if (intent != PL_INTENT_ABSOLUTE_COLORIMETRIC) + apply_chromatic_adaptation(src->white, dst->white, &xyz2rgb_d); + + // XYZs<-RGBs + pl_matrix3x3 rgb2xyz_s = pl_get_rgb2xyz_matrix(src); + pl_matrix3x3_mul(&xyz2rgb_d, &rgb2xyz_s); + return xyz2rgb_d; +} + +// Test the sign of 'p' relative to the line 'ab' (barycentric coordinates) +static float test_point_line(const struct pl_cie_xy p, + const struct pl_cie_xy a, + const struct pl_cie_xy b) +{ + return (p.x - b.x) * (a.y - b.y) - (a.x - b.x) * (p.y - b.y); +} + +// Test if a point is entirely inside a gamut +static float test_point_gamut(struct pl_cie_xy point, + const struct pl_raw_primaries *prim) +{ + float d1 = test_point_line(point, prim->red, prim->green), + d2 = test_point_line(point, prim->green, prim->blue), + d3 = test_point_line(point, prim->blue, prim->red); + + bool has_neg = d1 < -1e-6f || d2 < -1e-6f || d3 < -1e-6f, + has_pos = d1 > 1e-6f || d2 > 1e-6f || d3 > 1e-6f; + + return !(has_neg && has_pos); +} + +bool pl_primaries_superset(const struct pl_raw_primaries *a, + const struct pl_raw_primaries *b) +{ + return test_point_gamut(b->red, a) && + test_point_gamut(b->green, a) && + test_point_gamut(b->blue, a); +} + +bool pl_primaries_valid(const struct pl_raw_primaries *prim) +{ + // Test to see if the primaries form a valid triangle (nonzero area) + float area = (prim->blue.x - prim->green.x) * (prim->red.y - prim->green.y) + - (prim->red.x - prim->green.x) * (prim->blue.y - prim->green.y); + + return fabs(area) > 1e-6 && test_point_gamut(prim->white, prim); +} + +static inline float xy_dist2(struct pl_cie_xy a, struct pl_cie_xy b) +{ + const float dx = a.x - b.x, dy = a.y - b.y; + return dx * dx + dy * dy; +} + +bool pl_primaries_compatible(const struct pl_raw_primaries *a, + const struct pl_raw_primaries *b) +{ + float RR = xy_dist2(a->red, b->red), RG = xy_dist2(a->red, b->green), + RB = xy_dist2(a->red, b->blue), GG = xy_dist2(a->green, b->green), + GB = xy_dist2(a->green, b->blue), BB = xy_dist2(a->blue, b->blue); + return RR < RG && RR < RB && GG < RG && GG < GB && BB < RB && BB < GB; +} + +// returns the intersection of the two lines defined by ab and cd +static struct pl_cie_xy intersection(struct pl_cie_xy a, struct pl_cie_xy b, + struct pl_cie_xy c, struct pl_cie_xy d) +{ + float det = (a.x - b.x) * (c.y - d.y) - (a.y - b.y) * (c.x - d.x); + float t = ((a.x - c.x) * (c.y - d.y) - (a.y - c.y) * (c.x - d.x)) / det; + return (struct pl_cie_xy) { + .x = t ? a.x + t * (b.x - a.x) : 0.0f, + .y = t ? 
a.y + t * (b.y - a.y) : 0.0f, + }; +} + +// x, y, z specified in clockwise order, with a, b, c being the enclosing gamut +static struct pl_cie_xy +clip_point(struct pl_cie_xy x, struct pl_cie_xy y, struct pl_cie_xy z, + struct pl_cie_xy a, struct pl_cie_xy b, struct pl_cie_xy c) +{ + const float d1 = test_point_line(y, a, b); + const float d2 = test_point_line(y, b, c); + if (d1 <= 0.0f && d2 <= 0.0f) { + return y; // already inside triangle + } else if (d1 > 0.0f && d2 > 0.0f) { + return b; // target vertex fully enclosed + } else if (d1 > 0.0f) { + return intersection(a, b, y, z); + } else { + return intersection(x, y, b, c); + } +} + +struct pl_raw_primaries pl_primaries_clip(const struct pl_raw_primaries *src, + const struct pl_raw_primaries *dst) +{ + return (struct pl_raw_primaries) { + .red = clip_point(src->green, src->red, src->blue, + dst->green, dst->red, dst->blue), + .green = clip_point(src->blue, src->green, src->red, + dst->blue, dst->green, dst->red), + .blue = clip_point(src->red, src->blue, src->green, + dst->red, dst->blue, dst->green), + .white = src->white, + }; +} + +/* Fill in the Y, U, V vectors of a yuv-to-rgb conversion matrix + * based on the given luma weights of the R, G and B components (lr, lg, lb). + * lr+lg+lb is assumed to equal 1. + * This function is meant for colorspaces satisfying the following + * conditions (which are true for common YUV colorspaces): + * - The mapping from input [Y, U, V] to output [R, G, B] is linear. + * - Y is the vector [1, 1, 1]. (meaning input Y component maps to 1R+1G+1B) + * - U maps to a value with zero R and positive B ([0, x, y], y > 0; + * i.e. blue and green only). + * - V maps to a value with zero B and positive R ([x, y, 0], x > 0; + * i.e. red and green only). + * - U and V are orthogonal to the luma vector [lr, lg, lb]. + * - The magnitudes of the vectors U and V are the minimal ones for which + * the image of the set Y=[0...1],U=[-0.5...0.5],V=[-0.5...0.5] under the + * conversion function will cover the set R=[0...1],G=[0...1],B=[0...1] + * (the resulting matrix can be converted for other input/output ranges + * outside this function). + * Under these conditions the given parameters lr, lg, lb uniquely + * determine the mapping of Y, U, V to R, G, B. + */ +static pl_matrix3x3 luma_coeffs(float lr, float lg, float lb) +{ + pl_assert(fabs(lr+lg+lb - 1) < 1e-6); + return (pl_matrix3x3) {{ + {1, 0, 2 * (1-lr) }, + {1, -2 * (1-lb) * lb/lg, -2 * (1-lr) * lr/lg }, + {1, 2 * (1-lb), 0 }, + }}; +} + +// Applies hue and saturation controls to a YCbCr->RGB matrix +static inline void apply_hue_sat(pl_matrix3x3 *m, + const struct pl_color_adjustment *params) +{ + // Hue is equivalent to rotating input [U, V] subvector around the origin. + // Saturation scales [U, V]. 
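Equivalently (an illustrative restatement, not part of the commit), each row's (U, V) pair is multiplied by a scaled 2x2 rotation:

/*
 *   u' = saturation * ( cos(hue) * u - sin(hue) * v )
 *   v' = saturation * ( sin(hue) * u + cos(hue) * v )
 *
 * i.e. a rotation by `hue` scaled by `saturation`, which is what the loop
 * below computes via huecos/huesin.
 */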
+ float huecos = params->saturation * cos(params->hue); + float huesin = params->saturation * sin(params->hue); + for (int i = 0; i < 3; i++) { + float u = m->m[i][1], v = m->m[i][2]; + m->m[i][1] = huecos * u - huesin * v; + m->m[i][2] = huesin * u + huecos * v; + } +} + +pl_transform3x3 pl_color_repr_decode(struct pl_color_repr *repr, + const struct pl_color_adjustment *params) +{ + params = PL_DEF(params, &pl_color_adjustment_neutral); + + pl_matrix3x3 m; + switch (repr->sys) { + case PL_COLOR_SYSTEM_BT_709: m = luma_coeffs(0.2126, 0.7152, 0.0722); break; + case PL_COLOR_SYSTEM_BT_601: m = luma_coeffs(0.2990, 0.5870, 0.1140); break; + case PL_COLOR_SYSTEM_SMPTE_240M: m = luma_coeffs(0.2122, 0.7013, 0.0865); break; + case PL_COLOR_SYSTEM_BT_2020_NC: m = luma_coeffs(0.2627, 0.6780, 0.0593); break; + case PL_COLOR_SYSTEM_BT_2020_C: + // Note: This outputs into the [-0.5,0.5] range for chroma information. + m = (pl_matrix3x3) {{ + {0, 0, 1}, + {1, 0, 0}, + {0, 1, 0}, + }}; + break; + case PL_COLOR_SYSTEM_BT_2100_PQ: { + // Reversed from the matrix in the spec, hard-coded for efficiency + // and precision reasons. Exact values truncated from ITU-T H-series + // Supplement 18. + static const float lm_t = 0.008609, lm_p = 0.111029625; + m = (pl_matrix3x3) {{ + {1.0, lm_t, lm_p}, + {1.0, -lm_t, -lm_p}, + {1.0, 0.560031, -0.320627}, + }}; + break; + } + case PL_COLOR_SYSTEM_BT_2100_HLG: { + // Similar to BT.2100 PQ, exact values truncated from WolframAlpha + static const float lm_t = 0.01571858011, lm_p = 0.2095810681; + m = (pl_matrix3x3) {{ + {1.0, lm_t, lm_p}, + {1.0, -lm_t, -lm_p}, + {1.0, 1.02127108, -0.605274491}, + }}; + break; + } + case PL_COLOR_SYSTEM_DOLBYVISION: + m = repr->dovi->nonlinear; + break; + case PL_COLOR_SYSTEM_YCGCO: + m = (pl_matrix3x3) {{ + {1, -1, 1}, + {1, 1, 0}, + {1, -1, -1}, + }}; + break; + case PL_COLOR_SYSTEM_UNKNOWN: // fall through + case PL_COLOR_SYSTEM_RGB: + m = pl_matrix3x3_identity; + break; + case PL_COLOR_SYSTEM_XYZ: { + // For lack of anything saner to do, just assume the caller wants + // DCI-P3 primaries, which is a reasonable assumption. + const struct pl_raw_primaries *dst = pl_raw_primaries_get(PL_COLOR_PRIM_DCI_P3); + m = pl_get_xyz2rgb_matrix(dst); + // DCDM X'Y'Z' is expected to have equal energy white point (EG 432-1 Annex H) + apply_chromatic_adaptation((struct pl_cie_xy)CIE_E, dst->white, &m); + break; + } + case PL_COLOR_SYSTEM_COUNT: + pl_unreachable(); + } + + // Apply hue and saturation in the correct way depending on the colorspace. 
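Before the hue/saturation handling, a caller-facing sketch of this function as a whole (the wrapper name and sample values are hypothetical):

#include <libplacebo/colorspace.h>

static void repr_decode_sketch(void)
{
    struct pl_color_repr repr = {
        .sys    = PL_COLOR_SYSTEM_BT_709,
        .levels = PL_COLOR_LEVELS_LIMITED,
        .bits   = { .sample_depth = 8, .color_depth = 8 },
    };

    pl_transform3x3 tr = pl_color_repr_decode(&repr, NULL);
    // `repr` is updated in place to RGB / full range; `tr` maps normalized
    // Y'CbCr texture samples to full-range RGB

    float pixel[3] = { 16 / 255.f, 128 / 255.f, 128 / 255.f }; // 8-bit video black
    pl_matrix3x3_apply(&tr.mat, pixel);
    for (int i = 0; i < 3; i++)
        pixel[i] += tr.c[i];
    // pixel is now (approximately) { 0, 0, 0 }
}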
+ if (pl_color_system_is_ycbcr_like(repr->sys)) { + apply_hue_sat(&m, params); + } else if (params->saturation != 1.0 || params->hue != 0.0) { + // Arbitrarily simulate hue shifts using the BT.709 YCbCr model + pl_matrix3x3 yuv2rgb = luma_coeffs(0.2126, 0.7152, 0.0722); + pl_matrix3x3 rgb2yuv = yuv2rgb; + pl_matrix3x3_invert(&rgb2yuv); + apply_hue_sat(&yuv2rgb, params); + // M := RGB<-YUV * YUV<-RGB * M + pl_matrix3x3_rmul(&rgb2yuv, &m); + pl_matrix3x3_rmul(&yuv2rgb, &m); + } + + // Apply color temperature adaptation, relative to BT.709 primaries + if (params->temperature) { + struct pl_cie_xy src = pl_white_from_temp(6500); + struct pl_cie_xy dst = pl_white_from_temp(6500 + 3500 * params->temperature); + pl_matrix3x3 adapt = pl_get_adaptation_matrix(src, dst); + pl_matrix3x3_rmul(&adapt, &m); + } + + pl_transform3x3 out = { .mat = m }; + int bit_depth = PL_DEF(repr->bits.sample_depth, + PL_DEF(repr->bits.color_depth, 8)); + + double ymax, ymin, cmax, cmid; + double scale = (1LL << bit_depth) / ((1LL << bit_depth) - 1.0); + + switch (pl_color_levels_guess(repr)) { + case PL_COLOR_LEVELS_LIMITED: { + ymax = 235 / 256. * scale; + ymin = 16 / 256. * scale; + cmax = 240 / 256. * scale; + cmid = 128 / 256. * scale; + break; + } + case PL_COLOR_LEVELS_FULL: + // Note: For full-range YUV, there are multiple, subtly inconsistent + // standards. So just pick the sanest implementation, which is to + // assume MAX_INT == 1.0. + ymax = 1.0; + ymin = 0.0; + cmax = 1.0; + cmid = 128 / 256. * scale; // *not* exactly 0.5 + break; + default: + pl_unreachable(); + } + + double ymul = 1.0 / (ymax - ymin); + double cmul = 0.5 / (cmax - cmid); + + double mul[3] = { ymul, ymul, ymul }; + double black[3] = { ymin, ymin, ymin }; + +#ifdef PL_HAVE_DOVI + if (repr->sys == PL_COLOR_SYSTEM_DOLBYVISION) { + // The RPU matrix already includes levels normalization, but in this + // case we also have to respect the signalled color offsets + for (int i = 0; i < 3; i++) { + mul[i] = 1.0; + black[i] = repr->dovi->nonlinear_offset[i] * scale; + } + } else +#endif + if (pl_color_system_is_ycbcr_like(repr->sys)) { + mul[1] = mul[2] = cmul; + black[1] = black[2] = cmid; + } + + // Contrast scales the output value range (gain) + // Brightness scales the constant output bias (black lift/boost) + for (int i = 0; i < 3; i++) { + mul[i] *= params->contrast; + out.c[i] += params->brightness; + } + + // Multiply in the texture multiplier and adjust `c` so that black[j] keeps + // on mapping to RGB=0 (black to black) + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 3; j++) { + out.mat.m[i][j] *= mul[j]; + out.c[i] -= out.mat.m[i][j] * black[j]; + } + } + + // Finally, multiply in the scaling factor required to get the color up to + // the correct representation. + pl_matrix3x3_scale(&out.mat, pl_color_repr_normalize(repr)); + + // Update the metadata to reflect the change. + repr->sys = PL_COLOR_SYSTEM_RGB; + repr->levels = PL_COLOR_LEVELS_FULL; + + return out; +} + +bool pl_icc_profile_equal(const struct pl_icc_profile *p1, + const struct pl_icc_profile *p2) +{ + if (p1->len != p2->len) + return false; + + // Ignore signatures on length-0 profiles, as a special case + return !p1->len || p1->signature == p2->signature; +} + +void pl_icc_profile_compute_signature(struct pl_icc_profile *profile) +{ + if (!profile->len) + profile->signature = 0; + + // In theory, we could get this value from the profile header itself if + // lcms is available, but I'm not sure if it's even worth the trouble. 
Just + // hard-code this to a pl_mem_hash(), which is decently fast anyway. + profile->signature = pl_mem_hash(profile->data, profile->len); +} diff --git a/src/common.c b/src/common.c new file mode 100644 index 0000000..8c8a4f0 --- /dev/null +++ b/src/common.c @@ -0,0 +1,500 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <math.h> + +#include "common.h" +#include "version.h" + +#include <libplacebo/common.h> + +int pl_fix_ver(void) +{ + return BUILD_FIX_VER; +} + +const char *pl_version(void) +{ + return BUILD_VERSION; +} + +void pl_rect2d_normalize(pl_rect2d *rc) +{ + *rc = (pl_rect2d) { + .x0 = PL_MIN(rc->x0, rc->x1), + .x1 = PL_MAX(rc->x0, rc->x1), + .y0 = PL_MIN(rc->y0, rc->y1), + .y1 = PL_MAX(rc->y0, rc->y1), + }; +} + +void pl_rect3d_normalize(pl_rect3d *rc) +{ + *rc = (pl_rect3d) { + .x0 = PL_MIN(rc->x0, rc->x1), + .x1 = PL_MAX(rc->x0, rc->x1), + .y0 = PL_MIN(rc->y0, rc->y1), + .y1 = PL_MAX(rc->y0, rc->y1), + .z0 = PL_MIN(rc->z0, rc->z1), + .z1 = PL_MAX(rc->z0, rc->z1), + }; +} + +void pl_rect2df_normalize(pl_rect2df *rc) +{ + *rc = (pl_rect2df) { + .x0 = PL_MIN(rc->x0, rc->x1), + .x1 = PL_MAX(rc->x0, rc->x1), + .y0 = PL_MIN(rc->y0, rc->y1), + .y1 = PL_MAX(rc->y0, rc->y1), + }; +} + +void pl_rect3df_normalize(pl_rect3df *rc) +{ + *rc = (pl_rect3df) { + .x0 = PL_MIN(rc->x0, rc->x1), + .x1 = PL_MAX(rc->x0, rc->x1), + .y0 = PL_MIN(rc->y0, rc->y1), + .y1 = PL_MAX(rc->y0, rc->y1), + .z0 = PL_MIN(rc->z0, rc->z1), + .z1 = PL_MAX(rc->z0, rc->z1), + }; +} + +pl_rect2d pl_rect2df_round(const pl_rect2df *rc) +{ + return (pl_rect2d) { + .x0 = roundf(rc->x0), + .x1 = roundf(rc->x1), + .y0 = roundf(rc->y0), + .y1 = roundf(rc->y1), + }; +} + +pl_rect3d pl_rect3df_round(const pl_rect3df *rc) +{ + return (pl_rect3d) { + .x0 = roundf(rc->x0), + .x1 = roundf(rc->x1), + .y0 = roundf(rc->y0), + .y1 = roundf(rc->y1), + .z0 = roundf(rc->z0), + .z1 = roundf(rc->z1), + }; +} + +const pl_matrix3x3 pl_matrix3x3_identity = {{ + { 1, 0, 0 }, + { 0, 1, 0 }, + { 0, 0, 1 }, +}}; + +void pl_matrix3x3_apply(const pl_matrix3x3 *mat, float vec[3]) +{ + float x = vec[0], y = vec[1], z = vec[2]; + + for (int i = 0; i < 3; i++) + vec[i] = mat->m[i][0] * x + mat->m[i][1] * y + mat->m[i][2] * z; +} + +void pl_matrix3x3_apply_rc(const pl_matrix3x3 *mat, pl_rect3df *rc) +{ + float x0 = rc->x0, x1 = rc->x1, + y0 = rc->y0, y1 = rc->y1, + z0 = rc->z0, z1 = rc->z1; + + rc->x0 = mat->m[0][0] * x0 + mat->m[0][1] * y0 + mat->m[0][2] * z0; + rc->y0 = mat->m[1][0] * x0 + mat->m[1][1] * y0 + mat->m[1][2] * z0; + rc->z0 = mat->m[2][0] * x0 + mat->m[2][1] * y0 + mat->m[2][2] * z0; + + rc->x1 = mat->m[0][0] * x1 + mat->m[0][1] * y1 + mat->m[0][2] * z1; + rc->y1 = mat->m[1][0] * x1 + mat->m[1][1] * y1 + mat->m[1][2] * z1; + rc->z1 = mat->m[2][0] * x1 + mat->m[2][1] * y1 + mat->m[2][2] * z1; +} + +void pl_matrix3x3_scale(pl_matrix3x3 *mat, float scale) +{ + for 
(int i = 0; i < 3; i++) { + for (int j = 0; j < 3; j++) + mat->m[i][j] *= scale; + } +} + +void pl_matrix3x3_invert(pl_matrix3x3 *mat) +{ + double m00 = mat->m[0][0], m01 = mat->m[0][1], m02 = mat->m[0][2], + m10 = mat->m[1][0], m11 = mat->m[1][1], m12 = mat->m[1][2], + m20 = mat->m[2][0], m21 = mat->m[2][1], m22 = mat->m[2][2]; + + // calculate the adjoint + double a00 = (m11 * m22 - m21 * m12); + double a01 = -(m01 * m22 - m21 * m02); + double a02 = (m01 * m12 - m11 * m02); + double a10 = -(m10 * m22 - m20 * m12); + double a11 = (m00 * m22 - m20 * m02); + double a12 = -(m00 * m12 - m10 * m02); + double a20 = (m10 * m21 - m20 * m11); + double a21 = -(m00 * m21 - m20 * m01); + double a22 = (m00 * m11 - m10 * m01); + + // calculate the determinant (as inverse == 1/det * adjoint, + // adjoint * m == identity * det, so this calculates the det) + double det = m00 * a00 + m10 * a01 + m20 * a02; + det = 1.0 / det; + + mat->m[0][0] = det * a00; + mat->m[0][1] = det * a01; + mat->m[0][2] = det * a02; + mat->m[1][0] = det * a10; + mat->m[1][1] = det * a11; + mat->m[1][2] = det * a12; + mat->m[2][0] = det * a20; + mat->m[2][1] = det * a21; + mat->m[2][2] = det * a22; +} + +void pl_matrix3x3_mul(pl_matrix3x3 *a, const pl_matrix3x3 *b) +{ + float a00 = a->m[0][0], a01 = a->m[0][1], a02 = a->m[0][2], + a10 = a->m[1][0], a11 = a->m[1][1], a12 = a->m[1][2], + a20 = a->m[2][0], a21 = a->m[2][1], a22 = a->m[2][2]; + + for (int i = 0; i < 3; i++) { + a->m[0][i] = a00 * b->m[0][i] + a01 * b->m[1][i] + a02 * b->m[2][i]; + a->m[1][i] = a10 * b->m[0][i] + a11 * b->m[1][i] + a12 * b->m[2][i]; + a->m[2][i] = a20 * b->m[0][i] + a21 * b->m[1][i] + a22 * b->m[2][i]; + } +} + +void pl_matrix3x3_rmul(const pl_matrix3x3 *a, pl_matrix3x3 *b) +{ + pl_matrix3x3 m = *a; + pl_matrix3x3_mul(&m, b); + *b = m; +} + +const pl_transform3x3 pl_transform3x3_identity = { + .mat = {{ + { 1, 0, 0 }, + { 0, 1, 0 }, + { 0, 0, 1 }, + }}, +}; + +void pl_transform3x3_apply(const pl_transform3x3 *t, float vec[3]) +{ + pl_matrix3x3_apply(&t->mat, vec); + + for (int i = 0; i < 3; i++) + vec[i] += t->c[i]; +} + +void pl_transform3x3_apply_rc(const pl_transform3x3 *t, pl_rect3df *rc) +{ + pl_matrix3x3_apply_rc(&t->mat, rc); + + rc->x0 += t->c[0]; + rc->x1 += t->c[0]; + rc->y0 += t->c[1]; + rc->y1 += t->c[1]; + rc->z0 += t->c[2]; + rc->z1 += t->c[2]; +} + +void pl_transform3x3_scale(pl_transform3x3 *t, float scale) +{ + pl_matrix3x3_scale(&t->mat, scale); + + for (int i = 0; i < 3; i++) + t->c[i] *= scale; +} + +// based on DarkPlaces engine (relicensed from GPL to LGPL) +void pl_transform3x3_invert(pl_transform3x3 *t) +{ + pl_matrix3x3_invert(&t->mat); + + float m00 = t->mat.m[0][0], m01 = t->mat.m[0][1], m02 = t->mat.m[0][2], + m10 = t->mat.m[1][0], m11 = t->mat.m[1][1], m12 = t->mat.m[1][2], + m20 = t->mat.m[2][0], m21 = t->mat.m[2][1], m22 = t->mat.m[2][2]; + + // fix the constant coefficient + // rgb = M * yuv + C + // M^-1 * rgb = yuv + M^-1 * C + // yuv = M^-1 * rgb - M^-1 * C + // ^^^^^^^^^^ + float c0 = t->c[0], c1 = t->c[1], c2 = t->c[2]; + t->c[0] = -(m00 * c0 + m01 * c1 + m02 * c2); + t->c[1] = -(m10 * c0 + m11 * c1 + m12 * c2); + t->c[2] = -(m20 * c0 + m21 * c1 + m22 * c2); +} + +const pl_matrix2x2 pl_matrix2x2_identity = {{ + { 1, 0 }, + { 0, 1 }, +}}; + +pl_matrix2x2 pl_matrix2x2_rotation(float a) +{ + return (pl_matrix2x2) {{ + { cosf(a), -sinf(a) }, + { sinf(a), cosf(a) }, + }}; +} + +void pl_matrix2x2_apply(const pl_matrix2x2 *mat, float vec[2]) +{ + float x = vec[0], y = vec[1]; + + for (int i = 0; i < 2; i++) + vec[i] = 
mat->m[i][0] * x + mat->m[i][1] * y; +} + +void pl_matrix2x2_apply_rc(const pl_matrix2x2 *mat, pl_rect2df *rc) +{ + float x0 = rc->x0, x1 = rc->x1, + y0 = rc->y0, y1 = rc->y1; + + rc->x0 = mat->m[0][0] * x0 + mat->m[0][1] * y0; + rc->y0 = mat->m[1][0] * x0 + mat->m[1][1] * y0; + + rc->x1 = mat->m[0][0] * x1 + mat->m[0][1] * y1; + rc->y1 = mat->m[1][0] * x1 + mat->m[1][1] * y1; +} + +void pl_matrix2x2_mul(pl_matrix2x2 *a, const pl_matrix2x2 *b) +{ + float a00 = a->m[0][0], a01 = a->m[0][1], + a10 = a->m[1][0], a11 = a->m[1][1]; + + for (int i = 0; i < 2; i++) { + a->m[0][i] = a00 * b->m[0][i] + a01 * b->m[1][i]; + a->m[1][i] = a10 * b->m[0][i] + a11 * b->m[1][i]; + } +} + +void pl_matrix2x2_rmul(const pl_matrix2x2 *a, pl_matrix2x2 *b) +{ + pl_matrix2x2 m = *a; + pl_matrix2x2_mul(&m, b); + *b = m; +} + +void pl_matrix2x2_scale(pl_matrix2x2 *mat, float scale) +{ + for (int i = 0; i < 2; i++) { + for (int j = 0; j < 2; j++) + mat->m[i][j] *= scale; + } +} + +void pl_matrix2x2_invert(pl_matrix2x2 *mat) +{ + float m00 = mat->m[0][0], m01 = mat->m[0][1], + m10 = mat->m[1][0], m11 = mat->m[1][1]; + float invdet = 1.0f / (m11 * m00 - m10 * m01); + + mat->m[0][0] = m11 * invdet; + mat->m[0][1] = -m01 * invdet; + mat->m[1][0] = -m10 * invdet; + mat->m[1][1] = m00 * invdet; +} + +const pl_transform2x2 pl_transform2x2_identity = { + .mat = {{ + { 1, 0 }, + { 0, 1 }, + }}, +}; + +void pl_transform2x2_apply(const pl_transform2x2 *t, float vec[2]) +{ + pl_matrix2x2_apply(&t->mat, vec); + + for (int i = 0; i < 2; i++) + vec[i] += t->c[i]; +} + +void pl_transform2x2_apply_rc(const pl_transform2x2 *t, pl_rect2df *rc) +{ + pl_matrix2x2_apply_rc(&t->mat, rc); + + rc->x0 += t->c[0]; + rc->x1 += t->c[0]; + rc->y0 += t->c[1]; + rc->y1 += t->c[1]; +} + +void pl_transform2x2_mul(pl_transform2x2 *a, const pl_transform2x2 *b) +{ + float c[2] = { b->c[0], b->c[1] }; + pl_transform2x2_apply(a, c); + memcpy(a->c, c, sizeof(c)); + pl_matrix2x2_mul(&a->mat, &b->mat); +} + +void pl_transform2x2_rmul(const pl_transform2x2 *a, pl_transform2x2 *b) +{ + pl_transform2x2_apply(a, b->c); + pl_matrix2x2_rmul(&a->mat, &b->mat); +} + +void pl_transform2x2_scale(pl_transform2x2 *t, float scale) +{ + pl_matrix2x2_scale(&t->mat, scale); + + for (int i = 0; i < 2; i++) + t->c[i] *= scale; +} + +void pl_transform2x2_invert(pl_transform2x2 *t) +{ + pl_matrix2x2_invert(&t->mat); + + float m00 = t->mat.m[0][0], m01 = t->mat.m[0][1], + m10 = t->mat.m[1][0], m11 = t->mat.m[1][1]; + float c0 = t->c[0], c1 = t->c[1]; + t->c[0] = -(m00 * c0 + m01 * c1); + t->c[1] = -(m10 * c0 + m11 * c1); +} + +pl_rect2df pl_transform2x2_bounds(const pl_transform2x2 *t, const pl_rect2df *rc) +{ + float p[4][2] = { + { rc->x0, rc->y0 }, + { rc->x0, rc->y1 }, + { rc->x1, rc->y0 }, + { rc->x1, rc->y1 }, + }; + for (int i = 0; i < PL_ARRAY_SIZE(p); i++) + pl_transform2x2_apply(t, p[i]); + + return (pl_rect2df) { + .x0 = fminf(fminf(p[0][0], p[1][0]), fminf(p[2][0], p[3][0])), + .x1 = fmaxf(fmaxf(p[0][0], p[1][0]), fmaxf(p[2][0], p[3][0])), + .y0 = fminf(fminf(p[0][1], p[1][1]), fminf(p[2][1], p[3][1])), + .y1 = fmaxf(fmaxf(p[0][1], p[1][1]), fmaxf(p[2][1], p[3][1])), + }; +} + +float pl_rect2df_aspect(const pl_rect2df *rc) +{ + float w = fabsf(pl_rect_w(*rc)), h = fabsf(pl_rect_h(*rc)); + return h ? 
(w / h) : 0.0; +} + +void pl_rect2df_aspect_set(pl_rect2df *rc, float aspect, float panscan) +{ + pl_assert(aspect >= 0); + float orig_aspect = pl_rect2df_aspect(rc); + if (!aspect || !orig_aspect) + return; + + float scale_x, scale_y; + if (aspect > orig_aspect) { + // New aspect is wider than the original, so we need to either grow in + // scale_x (panscan=1) or shrink in scale_y (panscan=0) + scale_x = powf(aspect / orig_aspect, panscan); + scale_y = powf(aspect / orig_aspect, panscan - 1.0); + } else if (aspect < orig_aspect) { + // New aspect is taller, so either grow in scale_y (panscan=1) or + // shrink in scale_x (panscan=0) + scale_x = powf(orig_aspect / aspect, panscan - 1.0); + scale_y = powf(orig_aspect / aspect, panscan); + } else { + return; // No change in aspect + } + + pl_rect2df_stretch(rc, scale_x, scale_y); +} + +void pl_rect2df_aspect_fit(pl_rect2df *rc, const pl_rect2df *src, float panscan) +{ + float orig_w = fabs(pl_rect_w(*rc)), + orig_h = fabs(pl_rect_h(*rc)); + if (!orig_w || !orig_h) + return; + + // If either one of these is larger than 1, then we need to shrink to fit, + // otherwise we can just directly stretch the rect. + float scale_x = fabs(pl_rect_w(*src)) / orig_w, + scale_y = fabs(pl_rect_h(*src)) / orig_h; + + if (scale_x > 1.0 || scale_y > 1.0) { + pl_rect2df_aspect_copy(rc, src, panscan); + } else { + pl_rect2df_stretch(rc, scale_x, scale_y); + } +} + +void pl_rect2df_stretch(pl_rect2df *rc, float stretch_x, float stretch_y) +{ + float midx = (rc->x0 + rc->x1) / 2.0, + midy = (rc->y0 + rc->y1) / 2.0; + + rc->x0 = rc->x0 * stretch_x + midx * (1.0 - stretch_x); + rc->x1 = rc->x1 * stretch_x + midx * (1.0 - stretch_x); + rc->y0 = rc->y0 * stretch_y + midy * (1.0 - stretch_y); + rc->y1 = rc->y1 * stretch_y + midy * (1.0 - stretch_y); +} + +void pl_rect2df_offset(pl_rect2df *rc, float offset_x, float offset_y) +{ + if (rc->x1 < rc->x0) + offset_x = -offset_x; + if (rc->y1 < rc->y0) + offset_y = -offset_y; + + rc->x0 += offset_x; + rc->x1 += offset_x; + rc->y0 += offset_y; + rc->y1 += offset_y; +} + +void pl_rect2df_rotate(pl_rect2df *rc, pl_rotation rot) +{ + if (!(rot = pl_rotation_normalize(rot))) + return; + + float x0 = rc->x0, y0 = rc->y0, x1 = rc->x1, y1 = rc->y1; + if (rot >= PL_ROTATION_180) { + rot -= PL_ROTATION_180; + PL_SWAP(x0, x1); + PL_SWAP(y0, y1); + } + + switch (rot) { + case PL_ROTATION_0: + *rc = (pl_rect2df) { + .x0 = x0, + .y0 = y0, + .x1 = x1, + .y1 = y1, + }; + return; + case PL_ROTATION_90: + *rc = (pl_rect2df) { + .x0 = y1, + .y0 = x0, + .x1 = y0, + .y1 = x1, + }; + return; + default: pl_unreachable(); + } +} diff --git a/src/common.h b/src/common.h new file mode 100644 index 0000000..0cac24d --- /dev/null +++ b/src/common.h @@ -0,0 +1,191 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#pragma once + +#define __STDC_FORMAT_MACROS + +#ifdef __cplusplus +#include <version> +#endif + +#if !defined(__cplusplus) || defined(__cpp_lib_stdatomic_h) +#define PL_HAVE_STDATOMIC +#endif + +#ifdef PL_HAVE_STDATOMIC +#include <stdatomic.h> +#endif +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> +#include <stdlib.h> +#include <inttypes.h> + +#if defined(__MINGW32__) && !defined(__clang__) +#define PL_PRINTF(fmt, va) __attribute__ ((format(gnu_printf, fmt, va))) \ + __attribute__ ((nonnull(fmt))) +#elif defined(__GNUC__) +#define PL_PRINTF(fmt, va) __attribute__ ((format(printf, fmt, va))) \ + __attribute__ ((nonnull(fmt))) +#else +#define PL_PRINTF(fmt, va) +#endif + +#define PL_NOINLINE __attribute__((noinline)) + +#include "os.h" + +#include "config_internal.h" + +#define PL_DEPRECATED + +#include <libplacebo/config.h> + +#include "pl_assert.h" +#include "pl_alloc.h" +#include "pl_clock.h" +#include "pl_string.h" + +#if PL_API_VER != BUILD_API_VER +#error Header mismatch? <libplacebo/config.h> pulled from elsewhere! +#endif + +// Divide a number while rounding up (careful: double-eval) +#define PL_DIV_UP(x, y) (((x) + (y) - 1) / (y)) + +// Align up to the nearest multiple of an arbitrary alignment, which may also +// be 0 to signal no alignment requirements. +#define PL_ALIGN(x, align) ((align) ? PL_DIV_UP(x, align) * (align) : (x)) + +// This is faster but must only be called on positive powers of two. +#define PL_ALIGN2(x, align) (((x) + (align) - 1) & ~((align) - 1)) + +// Returns the log base 2 of an unsigned long long +#define PL_LOG2(x) ((unsigned) (8*sizeof (unsigned long long) - __builtin_clzll((x)) - 1)) + +// Rounds a number up to the nearest power of two +#define PL_ALIGN_POT(x) (0x1LLU << (PL_LOG2((x) - 1) + 1)) + +// Right shift a number while rounding up +#define PL_RSHIFT_UP(x, s) -((-(x)) >> (s)) + +// Returns whether or not a number is a power of two (or zero) +#define PL_ISPOT(x) (((x) & ((x) - 1)) == 0) + +// Returns the size of a static array with known size. +#define PL_ARRAY_SIZE(s) (sizeof(s) / sizeof((s)[0])) + +// Swaps two variables +#define PL_SWAP(a, b) \ + do { \ + __typeof__ (a) _tmp = (a); \ + (a) = (b); \ + (b) = _tmp; \ + } while (0) + +// Helper functions for transposing a matrix in-place. +#define PL_TRANSPOSE_DIM(d, m) \ + pl_transpose((d), (float[(d)*(d)]){0}, (const float *)(m)) + +#define PL_TRANSPOSE_2X2(m) PL_TRANSPOSE_DIM(2, m) +#define PL_TRANSPOSE_3X3(m) PL_TRANSPOSE_DIM(3, m) +#define PL_TRANSPOSE_4X4(m) PL_TRANSPOSE_DIM(4, m) + +static inline float *pl_transpose(int dim, float *out, const float *in) +{ + for (int i = 0; i < dim; i++) { + for (int j = 0; j < dim; j++) + out[i * dim + j] = in[j * dim + i]; + } + + return out; +} + +// Helper functions for some common numeric operations (careful: double-eval) +#define PL_MAX(x, y) ((x) > (y) ? (x) : (y)) +#define PL_MAX3(x, y, z) PL_MAX(PL_MAX(x, y), z) +#define PL_MIN(x, y) ((x) < (y) ? (x) : (y)) +#define PL_CLAMP(x, l, h) ((x) < (l) ? (l) : (x) > (h) ? (h) : (x)) +#define PL_CMP(a, b) (((a) > (b)) - ((a) < (b))) +#define PL_DEF(x, d) ((x) ? 
(x) : (d)) +#define PL_SQUARE(x) ((x) * (x)) +#define PL_CUBE(x) ((x) * (x) * (x)) +#define PL_MIX(a, b, x) ((x) * (b) + (1 - (x)) * (a)) + +static inline float pl_smoothstep(float edge0, float edge1, float x) +{ + if (edge0 == edge1) + return x >= edge0; + x = (x - edge0) / (edge1 - edge0); + x = PL_CLAMP(x, 0.0f, 1.0f); + return x * x * (3.0f - 2.0f * x); +} + +// Helpers for doing alignment calculations +static inline size_t pl_gcd(size_t x, size_t y) +{ + assert(x && y); + while (y) { + size_t tmp = y; + y = x % y; + x = tmp; + } + + return x; +} + +static inline size_t pl_lcm(size_t x, size_t y) +{ + assert(x && y); + return x * (y / pl_gcd(x, y)); +} + +// Conditional abort() macro that depends on the configuration option +#ifdef PL_DEBUG_ABORT +# define pl_debug_abort() do { \ + fprintf(stderr, "pl_debug_abort() triggered!\n"); \ + abort(); \ +} while (0) +#else +# define pl_debug_abort() do {} while (0) +#endif + +#ifdef PL_HAVE_STDATOMIC + +// Refcounting helpers +typedef atomic_uint_fast32_t pl_rc_t; +#define pl_rc_init(rc) atomic_init(rc, 1) +#define pl_rc_ref(rc) ((void) atomic_fetch_add_explicit(rc, 1, memory_order_acquire)) +#define pl_rc_deref(rc) (atomic_fetch_sub_explicit(rc, 1, memory_order_release) == 1) +#define pl_rc_count(rc) atomic_load(rc) + +#endif + +#define pl_unreachable() (assert(!"unreachable"), __builtin_unreachable()) + +// Helper for parameter validation +#define pl_require(ctx, expr) \ + do { \ + if (!(expr)) { \ + PL_ERR(ctx, "Validation failed: %s (%s:%d)", \ + #expr, __FILE__, __LINE__); \ + pl_log_stack_trace(ctx->log, PL_LOG_ERR); \ + pl_debug_abort(); \ + goto error; \ + } \ + } while (0) diff --git a/src/convert.cc b/src/convert.cc new file mode 100644 index 0000000..05c9dd0 --- /dev/null +++ b/src/convert.cc @@ -0,0 +1,233 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <charconv> +#include <limits> +#include <system_error> + +#if __has_include(<fast_float/fast_float.h>) +# include <fast_float/fast_float.h> +#endif + +#include "pl_string.h" + +[[maybe_unused]] +static int ccStrPrintDouble( char *str, int bufsize, int decimals, double value ); + +namespace { + +template <typename T> +struct has_std_to_chars_impl { + template <typename CT> + static auto _(CT s) -> decltype(std::to_chars(s, s, std::declval<T>()), std::true_type{}); + static auto _(...) -> std::false_type; + static constexpr bool value = decltype(_((char *){}))::value; +}; + +template <typename T> +constexpr bool has_std_to_chars = has_std_to_chars_impl<T>::value; + +template <typename T, typename... Args> +static inline int to_chars(char *buf, size_t len, T n, Args ...args) +{ + if constexpr (has_std_to_chars<T>) { + auto [ptr, ec] = std::to_chars(buf, buf + len, n, args...); + return ec == std::errc() ? 
ptr - buf : 0; + } else { + static_assert(std::is_same_v<float, T> || std::is_same_v<double, T>, + "Not implemented!"); + // FIXME: Fallback for GCC <= 10 currently required for MinGW-w64 on + // Ubuntu 22.04. Remove this when Ubuntu 24.04 is released, as it will + // provide newer MinGW-w64 GCC and it will be safe to require it. + return ccStrPrintDouble(buf, len, std::numeric_limits<T>::max_digits10, n); + } +} + +template <typename T> +struct has_std_from_chars_impl { + template <typename CT> + static auto _(CT s) -> decltype(std::from_chars(s, s, std::declval<T&>()), std::true_type{}); + static auto _(...) -> std::false_type; + static constexpr bool value = decltype(_((const char *){}))::value; +}; + +template <typename T> +constexpr bool has_std_from_chars = has_std_from_chars_impl<T>::value; + +template <typename T, typename... Args> +static inline bool from_chars(pl_str str, T &n, Args ...args) +{ + if constexpr (has_std_from_chars<T>) { + auto [ptr, ec] = std::from_chars((const char *) str.buf, + (const char *) str.buf + str.len, + n, args...); + return ec == std::errc(); + } else { + constexpr bool is_fp = std::is_same_v<float, T> || std::is_same_v<double, T>; + static_assert(is_fp, "Not implemented!"); +#if !__has_include(<fast_float/fast_float.h>) + static_assert(!is_fp, "<fast_float/fast_float.h> is required, but not " \ + "found. Please run `git submodule update --init`" \ + " or provide <fast_float/fast_float.h>"); +#else + // FIXME: Fallback for libc++, as it does not implement floating-point + // variant of std::from_chars. Remove this when appropriate. + auto [ptr, ec] = fast_float::from_chars((const char *) str.buf, + (const char *) str.buf + str.len, + n, args...); + return ec == std::errc(); +#endif + } +} + +} + +#define CHAR_CONVERT(name, type, ...) \ + int pl_str_print_##name(char *buf, size_t len, type n) \ + { \ + return to_chars(buf, len, n __VA_OPT__(,) __VA_ARGS__); \ + } \ + bool pl_str_parse_##name(pl_str str, type *n) \ + { \ + return from_chars(str, *n __VA_OPT__(,) __VA_ARGS__); \ + } + +CHAR_CONVERT(hex, unsigned short, 16) +CHAR_CONVERT(int, int) +CHAR_CONVERT(uint, unsigned int) +CHAR_CONVERT(int64, int64_t) +CHAR_CONVERT(uint64, uint64_t) +CHAR_CONVERT(float, float) +CHAR_CONVERT(double, double) + +/* ***************************************************************************** + * + * Copyright (c) 2007-2016 Alexis Naveros. + * Modified for use with libplacebo by Niklas Haas + * Changes include: + * - Removed a CC_MIN macro dependency by equivalent logic + * - Removed CC_ALWAYSINLINE + * - Fixed (!seq) check to (!seqlength) + * - Added support for scientific notation (e.g. 1.0e10) in ccSeqParseDouble + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. 
+ * + * ----------------------------------------------------------------------------- + */ + +static int ccStrPrintDouble( char *str, int bufsize, int decimals, double value ) +{ + int size, offset, index; + int32_t frac, accumsub; + double muldec; + uint32_t u32; + uint64_t u64; + + size = 0; + if( value < 0.0 ) + { + size = 1; + *str++ = '-'; + bufsize--; + value = -value; + } + + if( value < 4294967296.0 ) + { + u32 = (uint32_t)value; + offset = pl_str_print_uint( str, bufsize, u32 ); + if (!offset) + goto error; + size += offset; + bufsize -= size; + value -= (double)u32; + } + else if( value < 18446744073709551616.0 ) + { + u64 = (uint64_t)value; + offset = pl_str_print_uint64( str, bufsize, u64 ); + if (!offset) + goto error; + size += offset; + bufsize -= size; + value -= (double)u64; + } + else + goto error; + + if (decimals > bufsize - 2) + decimals = bufsize - 2; + if( decimals <= 0 ) + return size; + + muldec = 10.0; + accumsub = 0; + str += offset; + + for( index = 0 ; index < decimals ; index++ ) + { + // Skip printing insignificant decimal digits + if (value * muldec - accumsub <= std::numeric_limits<double>::epsilon()) + break; + if (index == 0) { + size += 1; + *str++ = '.'; + } + frac = (int32_t)( value * muldec ) - accumsub; + frac = PL_CLAMP(frac, 0, 9); // FIXME: why is this needed? + str[index] = '0' + (char)frac; + accumsub += frac; + accumsub = ( accumsub << 3 ) + ( accumsub << 1 ); + if( muldec < 10000000 ) + muldec *= 10.0; + else + { + value *= 10000000.0; + value -= (int32_t)value; + muldec = 10.0; + accumsub = 0; + } + } + // Round up the last decimal digit + if ( str[ index - 1 ] < '9' && (int32_t)( value * muldec ) - accumsub >= 5 ) + str[ index - 1 ]++; + str[ index ] = 0; + size += index; + return size; + +error: + if( bufsize < 4 ) + *str = 0; + else + { + str[0] = 'E'; + str[1] = 'R'; + str[2] = 'R'; + str[3] = 0; + } + return 0; +} diff --git a/src/d3d11/common.h b/src/d3d11/common.h new file mode 100644 index 0000000..e14b709 --- /dev/null +++ b/src/d3d11/common.h @@ -0,0 +1,66 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include "../common.h" +#include "../log.h" + +#ifdef PL_HAVE_DXGI_DEBUG +#include <dxgidebug.h> +#endif + +#include <libplacebo/d3d11.h> + +// Shared struct used to hold the D3D11 device and associated interfaces +struct d3d11_ctx { + pl_log log; + pl_d3d11 d3d11; + + // Copy of the device from pl_d3d11 for convenience. Does not hold an + // additional reference. + ID3D11Device *dev; + + // DXGI device. This does hold a reference. + IDXGIDevice1 *dxgi_dev; + +#ifdef PL_HAVE_DXGI_DEBUG + // Debug interfaces + IDXGIDebug *debug; + IDXGIInfoQueue *iqueue; + uint64_t last_discarded; // Last count of discarded messages + DXGI_INFO_QUEUE_MESSAGE *dxgi_msg; +#endif + + // pl_gpu_is_failed (We saw a device removed error!) 
+ bool is_failed; +}; + +// DDK value. Apparently some D3D functions can return this instead of the +// proper user-mode error code. See: +// https://docs.microsoft.com/en-us/windows/win32/api/dxgi/nf-dxgi-idxgiswapchain-present +#define D3DDDIERR_DEVICEREMOVED (0x88760870) + +#ifndef D3D11_FORMAT_SUPPORT2_UAV_TYPED_STORE +#define D3D11_FORMAT_SUPPORT2_UAV_TYPED_STORE (0x80) +#endif +#ifndef D3D11_FORMAT_SUPPORT2_UAV_TYPED_LOAD +#define D3D11_FORMAT_SUPPORT2_UAV_TYPED_LOAD (0x40) +#endif +#ifndef PL_HAVE_DXGI_DEBUG_D3D11 +DEFINE_GUID(DXGI_DEBUG_D3D11, 0x4b99317b, 0xac39, 0x4aa6, 0xbb, 0xb, 0xba, 0xa0, 0x47, 0x84, 0x79, 0x8f); +#endif diff --git a/src/d3d11/context.c b/src/d3d11/context.c new file mode 100644 index 0000000..e0ba90f --- /dev/null +++ b/src/d3d11/context.c @@ -0,0 +1,488 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "gpu.h" + +// Windows 8 enum value, not present in mingw-w64 v7 +#define DXGI_ADAPTER_FLAG_SOFTWARE (2) + +const struct pl_d3d11_params pl_d3d11_default_params = { PL_D3D11_DEFAULTS }; + +static INIT_ONCE d3d11_once = INIT_ONCE_STATIC_INIT; +static PFN_D3D11_CREATE_DEVICE pD3D11CreateDevice = NULL; +static __typeof__(&CreateDXGIFactory1) pCreateDXGIFactory1 = NULL; +#ifdef PL_HAVE_DXGI_DEBUG +static __typeof__(&DXGIGetDebugInterface) pDXGIGetDebugInterface = NULL; +#endif + +static void d3d11_load(void) +{ + BOOL bPending = FALSE; + InitOnceBeginInitialize(&d3d11_once, 0, &bPending, NULL); + + if (bPending) + { + HMODULE d3d11 = LoadLibraryW(L"d3d11.dll"); + if (d3d11) { + pD3D11CreateDevice = (void *) + GetProcAddress(d3d11, "D3D11CreateDevice"); + } + + HMODULE dxgi = LoadLibraryW(L"dxgi.dll"); + if (dxgi) { + pCreateDXGIFactory1 = (void *) + GetProcAddress(dxgi, "CreateDXGIFactory1"); + } + +#ifdef PL_HAVE_DXGI_DEBUG + HMODULE dxgi_debug = LoadLibraryW(L"dxgidebug.dll"); + if (dxgi_debug) { + pDXGIGetDebugInterface = (void *) + GetProcAddress(dxgi_debug, "DXGIGetDebugInterface"); + } +#endif + } + + InitOnceComplete(&d3d11_once, 0, NULL); +} + +// Get a const array of D3D_FEATURE_LEVELs from max_fl to min_fl (inclusive) +static int get_feature_levels(int max_fl, int min_fl, + const D3D_FEATURE_LEVEL **out) +{ + static const D3D_FEATURE_LEVEL levels[] = { + D3D_FEATURE_LEVEL_12_1, + D3D_FEATURE_LEVEL_12_0, + D3D_FEATURE_LEVEL_11_1, + D3D_FEATURE_LEVEL_11_0, + D3D_FEATURE_LEVEL_10_1, + D3D_FEATURE_LEVEL_10_0, + D3D_FEATURE_LEVEL_9_3, + D3D_FEATURE_LEVEL_9_2, + D3D_FEATURE_LEVEL_9_1, + }; + static const int levels_len = PL_ARRAY_SIZE(levels); + + int start = 0; + for (; start < levels_len; start++) { + if (levels[start] <= max_fl) + break; + } + int len = 0; + for (; start + len < levels_len; len++) { + if (levels[start + len] < min_fl) + break; + } + *out = &levels[start]; + return len; +} + +static bool is_null_luid(LUID luid) +{ + return luid.LowPart == 0 && luid.HighPart == 0; +} + 
+static IDXGIAdapter *get_adapter(pl_d3d11 d3d11, LUID adapter_luid) +{ + struct d3d11_ctx *ctx = PL_PRIV(d3d11); + IDXGIFactory1 *factory = NULL; + IDXGIAdapter1 *adapter1 = NULL; + IDXGIAdapter *adapter = NULL; + HRESULT hr; + + if (!pCreateDXGIFactory1) { + PL_FATAL(ctx, "Failed to load dxgi.dll"); + goto error; + } + pCreateDXGIFactory1(&IID_IDXGIFactory1, (void **) &factory); + + for (int i = 0;; i++) { + hr = IDXGIFactory1_EnumAdapters1(factory, i, &adapter1); + if (hr == DXGI_ERROR_NOT_FOUND) + break; + if (FAILED(hr)) { + PL_FATAL(ctx, "Failed to enumerate adapters"); + goto error; + } + + DXGI_ADAPTER_DESC1 desc; + D3D(IDXGIAdapter1_GetDesc1(adapter1, &desc)); + if (desc.AdapterLuid.LowPart == adapter_luid.LowPart && + desc.AdapterLuid.HighPart == adapter_luid.HighPart) + { + break; + } + + SAFE_RELEASE(adapter1); + } + if (!adapter1) { + PL_FATAL(ctx, "Adapter with LUID %08lx%08lx not found", + adapter_luid.HighPart, adapter_luid.LowPart); + goto error; + } + + D3D(IDXGIAdapter1_QueryInterface(adapter1, &IID_IDXGIAdapter, + (void **) &adapter)); + +error: + SAFE_RELEASE(factory); + SAFE_RELEASE(adapter1); + return adapter; +} + +static bool has_sdk_layers(void) +{ + // This will fail if the SDK layers aren't installed + return SUCCEEDED(pD3D11CreateDevice(NULL, D3D_DRIVER_TYPE_NULL, NULL, + D3D11_CREATE_DEVICE_DEBUG, NULL, 0, D3D11_SDK_VERSION, NULL, NULL, + NULL)); +} + +static ID3D11Device *create_device(struct pl_d3d11_t *d3d11, + const struct pl_d3d11_params *params) +{ + struct d3d11_ctx *ctx = PL_PRIV(d3d11); + bool debug = params->debug; + bool warp = params->force_software; + int max_fl = params->max_feature_level; + int min_fl = params->min_feature_level; + ID3D11Device *dev = NULL; + IDXGIDevice1 *dxgi_dev = NULL; + IDXGIAdapter *adapter = NULL; + bool release_adapter = false; + HRESULT hr; + + d3d11_load(); + + if (!pD3D11CreateDevice) { + PL_FATAL(ctx, "Failed to load d3d11.dll"); + goto error; + } + + if (params->adapter) { + adapter = params->adapter; + } else if (!is_null_luid(params->adapter_luid)) { + adapter = get_adapter(d3d11, params->adapter_luid); + release_adapter = true; + } + + if (debug && !has_sdk_layers()) { + PL_INFO(ctx, "Debug layer not available, removing debug flag"); + debug = false; + } + + // Return here to retry creating the device + do { + // Use these default feature levels if they are not set + max_fl = PL_DEF(max_fl, D3D_FEATURE_LEVEL_12_1); + min_fl = PL_DEF(min_fl, D3D_FEATURE_LEVEL_9_1); + + // Get a list of feature levels from min_fl to max_fl + const D3D_FEATURE_LEVEL *levels; + int levels_len = get_feature_levels(max_fl, min_fl, &levels); + if (!levels_len) { + PL_FATAL(ctx, "No suitable Direct3D feature level found"); + goto error; + } + + D3D_DRIVER_TYPE type = D3D_DRIVER_TYPE_UNKNOWN; + if (!adapter) { + if (warp) { + type = D3D_DRIVER_TYPE_WARP; + } else { + type = D3D_DRIVER_TYPE_HARDWARE; + } + } + + UINT flags = params->flags; + if (debug) + flags |= D3D11_CREATE_DEVICE_DEBUG; + + hr = pD3D11CreateDevice(adapter, type, NULL, flags, levels, levels_len, + D3D11_SDK_VERSION, &dev, NULL, NULL); + if (SUCCEEDED(hr)) + break; + + pl_d3d11_after_error(ctx, hr); + + // Trying to create a D3D_FEATURE_LEVEL_12_0 device on Windows 8.1 or + // below will not succeed. Try an 11_1 device. 
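+ // (D3D11CreateDevice rejects feature levels unknown to the installed
+ // runtime with E_INVALIDARG instead of silently dropping them, hence
+ // the capped retries below.)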
+ if (hr == E_INVALIDARG && max_fl >= D3D_FEATURE_LEVEL_12_0 && + min_fl <= D3D_FEATURE_LEVEL_11_1) { + PL_DEBUG(ctx, "Failed to create 12_0+ device, trying 11_1"); + max_fl = D3D_FEATURE_LEVEL_11_1; + continue; + } + + // Trying to create a D3D_FEATURE_LEVEL_11_1 device on Windows 7 + // without the platform update will not succeed. Try an 11_0 device. + if (hr == E_INVALIDARG && max_fl >= D3D_FEATURE_LEVEL_11_1 && + min_fl <= D3D_FEATURE_LEVEL_11_0) { + PL_DEBUG(ctx, "Failed to create 11_1+ device, trying 11_0"); + max_fl = D3D_FEATURE_LEVEL_11_0; + continue; + } + + // Retry with WARP if allowed + if (!adapter && !warp && params->allow_software) { + PL_DEBUG(ctx, "Failed to create hardware device, trying WARP: %s", + pl_hresult_to_str(hr)); + warp = true; + max_fl = params->max_feature_level; + min_fl = params->min_feature_level; + continue; + } + + PL_FATAL(ctx, "Failed to create Direct3D 11 device: %s", + pl_hresult_to_str(hr)); + goto error; + } while (true); + + if (params->max_frame_latency) { + D3D(ID3D11Device_QueryInterface(dev, &IID_IDXGIDevice1, + (void **) &dxgi_dev)); + IDXGIDevice1_SetMaximumFrameLatency(dxgi_dev, params->max_frame_latency); + } + + d3d11->software = warp; + +error: + if (release_adapter) + SAFE_RELEASE(adapter); + SAFE_RELEASE(dxgi_dev); + return dev; +} + +static void init_debug_layer(struct d3d11_ctx *ctx, bool leak_check) +{ +#ifdef PL_HAVE_DXGI_DEBUG + if (!pDXGIGetDebugInterface) + d3d11_load(); + + if (!pDXGIGetDebugInterface) + goto error; + + D3D(pDXGIGetDebugInterface(&IID_IDXGIInfoQueue, (void **) &ctx->iqueue)); + + // Push empty filter to get everything + IDXGIInfoQueue_PushStorageFilter(ctx->iqueue, DXGI_DEBUG_ALL, + &(DXGI_INFO_QUEUE_FILTER){0}); + + // Filter some annoying D3D11 messages + DXGI_INFO_QUEUE_MESSAGE_ID deny_ids[] = { + // This false-positive error occurs every time we Draw() with a shader + // that samples from a texture format that only supports point sampling. + // Since we already use CheckFormatSupport to know which formats can be + // linearly sampled from, we shouldn't ever bind a non-point sampler to + // a format that doesn't support it. 
+ D3D11_MESSAGE_ID_DEVICE_DRAW_RESOURCE_FORMAT_SAMPLE_UNSUPPORTED, + }; + DXGI_INFO_QUEUE_FILTER filter = { + .DenyList = { + .NumIDs = PL_ARRAY_SIZE(deny_ids), + .pIDList = deny_ids, + }, + }; + IDXGIInfoQueue_PushStorageFilter(ctx->iqueue, DXGI_DEBUG_D3D11, &filter); + + IDXGIInfoQueue_SetMessageCountLimit(ctx->iqueue, DXGI_DEBUG_D3D11, -1); + IDXGIInfoQueue_SetMessageCountLimit(ctx->iqueue, DXGI_DEBUG_DXGI, -1); + + if (leak_check) + D3D(pDXGIGetDebugInterface(&IID_IDXGIDebug, (void **) &ctx->debug)); + +error: + return; +#endif +} + +void pl_d3d11_destroy(pl_d3d11 *ptr) +{ + pl_d3d11 d3d11 = *ptr; + if (!d3d11) + return; + struct d3d11_ctx *ctx = PL_PRIV(d3d11); + + pl_gpu_destroy(d3d11->gpu); + + SAFE_RELEASE(ctx->dev); + SAFE_RELEASE(ctx->dxgi_dev); + +#ifdef PL_HAVE_DXGI_DEBUG + if (ctx->debug) { + // Report any leaked objects + pl_d3d11_flush_message_queue(ctx, "After destroy"); + IDXGIDebug_ReportLiveObjects(ctx->debug, DXGI_DEBUG_ALL, DXGI_DEBUG_RLO_DETAIL); + pl_d3d11_flush_message_queue(ctx, "After leak check"); + IDXGIDebug_ReportLiveObjects(ctx->debug, DXGI_DEBUG_ALL, DXGI_DEBUG_RLO_SUMMARY); + pl_d3d11_flush_message_queue(ctx, "After leak summary"); + } + + SAFE_RELEASE(ctx->debug); + SAFE_RELEASE(ctx->iqueue); +#endif + + pl_free_ptr((void **) ptr); +} + +pl_d3d11 pl_d3d11_create(pl_log log, const struct pl_d3d11_params *params) +{ + params = PL_DEF(params, &pl_d3d11_default_params); + IDXGIAdapter1 *adapter = NULL; + IDXGIAdapter2 *adapter2 = NULL; + bool success = false; + HRESULT hr; + + struct pl_d3d11_t *d3d11 = pl_zalloc_obj(NULL, d3d11, struct d3d11_ctx); + struct d3d11_ctx *ctx = PL_PRIV(d3d11); + ctx->log = log; + ctx->d3d11 = d3d11; + + if (params->device) { + d3d11->device = params->device; + ID3D11Device_AddRef(d3d11->device); + } else { + d3d11->device = create_device(d3d11, params); + if (!d3d11->device) + goto error; + } + ctx->dev = d3d11->device; + + if (params->debug || + ID3D11Device_GetCreationFlags(d3d11->device) & D3D11_CREATE_DEVICE_DEBUG) + { + // Do not report live object on pl_d3d11_destroy if device was created + // externally, it makes no sense as there will be a lot of things alive. 
+ init_debug_layer(ctx, !params->device); + } + + D3D(ID3D11Device_QueryInterface(d3d11->device, &IID_IDXGIDevice1, + (void **) &ctx->dxgi_dev)); + D3D(IDXGIDevice1_GetParent(ctx->dxgi_dev, &IID_IDXGIAdapter1, + (void **) &adapter)); + + hr = IDXGIAdapter1_QueryInterface(adapter, &IID_IDXGIAdapter2, + (void **) &adapter2); + if (FAILED(hr)) + adapter2 = NULL; + + if (adapter2) { + PL_INFO(ctx, "Using DXGI 1.2+"); + } else { + PL_INFO(ctx, "Using DXGI 1.1"); + } + + D3D_FEATURE_LEVEL fl = ID3D11Device_GetFeatureLevel(d3d11->device); + PL_INFO(ctx, "Using Direct3D 11 feature level %u_%u", + ((unsigned) fl) >> 12, (((unsigned) fl) >> 8) & 0xf); + + char *dev_name = NULL; + UINT vendor_id, device_id, revision, subsys_id; + LUID adapter_luid; + UINT flags; + + if (adapter2) { + // DXGI 1.2 IDXGIAdapter2::GetDesc2 is preferred over the DXGI 1.1 + // version because it reports the real adapter information when using + // feature level 9 hardware + DXGI_ADAPTER_DESC2 desc; + D3D(IDXGIAdapter2_GetDesc2(adapter2, &desc)); + + dev_name = pl_to_utf8(NULL, desc.Description); + vendor_id = desc.VendorId; + device_id = desc.DeviceId; + revision = desc.Revision; + subsys_id = desc.SubSysId; + adapter_luid = desc.AdapterLuid; + flags = desc.Flags; + } else { + DXGI_ADAPTER_DESC1 desc; + D3D(IDXGIAdapter1_GetDesc1(adapter, &desc)); + + dev_name = pl_to_utf8(NULL, desc.Description); + vendor_id = desc.VendorId; + device_id = desc.DeviceId; + revision = desc.Revision; + subsys_id = desc.SubSysId; + adapter_luid = desc.AdapterLuid; + flags = desc.Flags; + } + + PL_INFO(ctx, "Direct3D 11 device properties:"); + PL_INFO(ctx, " Device Name: %s", dev_name); + PL_INFO(ctx, " Device ID: %04x:%04x (rev %02x)", + vendor_id, device_id, revision); + PL_INFO(ctx, " Subsystem ID: %04x:%04x", + LOWORD(subsys_id), HIWORD(subsys_id)); + PL_INFO(ctx, " LUID: %08lx%08lx", + adapter_luid.HighPart, adapter_luid.LowPart); + pl_free(dev_name); + + LARGE_INTEGER version; + hr = IDXGIAdapter1_CheckInterfaceSupport(adapter, &IID_IDXGIDevice, &version); + if (SUCCEEDED(hr)) { + PL_INFO(ctx, " Driver version: %u.%u.%u.%u", + HIWORD(version.HighPart), LOWORD(version.HighPart), + HIWORD(version.LowPart), LOWORD(version.LowPart)); + } + + // Note: DXGI_ADAPTER_FLAG_SOFTWARE doesn't exist before Windows 8, but we + // also set d3d11->software in create_device if we pick WARP ourselves + if (flags & DXGI_ADAPTER_FLAG_SOFTWARE) + d3d11->software = true; + + // If the primary display adapter is a software adapter, the + // DXGI_ADAPTER_FLAG_SOFTWARE flag won't be set, but the device IDs should + // still match the Microsoft Basic Render Driver + if (vendor_id == 0x1414 && device_id == 0x8c) + d3d11->software = true; + + if (d3d11->software) { + bool external_adapter = params->device || params->adapter || + !is_null_luid(params->adapter_luid); + + // The allow_software flag only applies if the API user didn't manually + // specify an adapter or a device + if (!params->allow_software && !external_adapter) { + // If we got this far with allow_software set, the primary adapter + // must be a software adapter + PL_ERR(ctx, "Primary adapter is a software adapter"); + goto error; + } + + // If a software adapter was manually specified, don't show a warning + enum pl_log_level level = PL_LOG_WARN; + if (external_adapter || params->force_software) + level = PL_LOG_INFO; + + PL_MSG(ctx, level, "Using a software adapter"); + } + + d3d11->gpu = pl_gpu_create_d3d11(ctx); + if (!d3d11->gpu) + goto error; + + success = true; +error: + if (!success) { + 
PL_FATAL(ctx, "Failed initializing Direct3D 11 device"); + pl_d3d11_destroy((pl_d3d11 *) &d3d11); + } + SAFE_RELEASE(adapter); + SAFE_RELEASE(adapter2); + return d3d11; +} diff --git a/src/d3d11/formats.c b/src/d3d11/formats.c new file mode 100644 index 0000000..7aaec26 --- /dev/null +++ b/src/d3d11/formats.c @@ -0,0 +1,293 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "formats.h" +#include "gpu.h" + +#define FMT(_minor, _name, _dxfmt, _type, num, size, bits, order) \ + (struct d3d_format) { \ + .dxfmt = DXGI_FORMAT_##_dxfmt##_##_type, \ + .minor = _minor, \ + .fmt = { \ + .name = _name, \ + .type = PL_FMT_##_type, \ + .num_components = num, \ + .component_depth = bits, \ + .texel_size = size, \ + .texel_align = 1, \ + .internal_size = size, \ + .host_bits = bits, \ + .sample_order = order, \ + }, \ + } + +#define IDX(...) {__VA_ARGS__} +#define BITS(...) {__VA_ARGS__} + +#define REGFMT(name, dxfmt, type, num, bits) \ + FMT(0, name, dxfmt, type, num, (num) * (bits) / 8, \ + BITS(bits, bits, bits, bits), \ + IDX(0, 1, 2, 3)) + +#define EMUFMT(_name, _dxfmt, _type, in, en, ib, eb) \ + (struct d3d_format) { \ + .dxfmt = DXGI_FORMAT_##_dxfmt##_##_type, \ + .minor = 0, \ + .fmt = { \ + .name = _name, \ + .type = PL_FMT_##_type, \ + .num_components = en, \ + .component_depth = BITS(ib, ib, ib, ib), \ + .internal_size = (in) * (ib) / 8, \ + .opaque = false, \ + .emulated = true, \ + .texel_size = (en) * (eb) / 8, \ + .texel_align = (eb) / 8, \ + .host_bits = BITS(eb, eb, eb, eb), \ + .sample_order = IDX(0, 1, 2, 3), \ + }, \ + } + +const struct d3d_format pl_d3d11_formats[] = { + REGFMT("r8", R8, UNORM, 1, 8), + REGFMT("rg8", R8G8, UNORM, 2, 8), + EMUFMT("rgb8", R8G8B8A8, UNORM, 4, 3, 8, 8), + REGFMT("rgba8", R8G8B8A8, UNORM, 4, 8), + REGFMT("r16", R16, UNORM, 1, 16), + REGFMT("rg16", R16G16, UNORM, 2, 16), + EMUFMT("rgb16", R16G16B16A16, UNORM, 4, 3, 16, 16), + REGFMT("rgba16", R16G16B16A16, UNORM, 4, 16), + + REGFMT("r8s", R8, SNORM, 1, 8), + REGFMT("rg8s", R8G8, SNORM, 2, 8), + REGFMT("rgba8s", R8G8B8A8, SNORM, 4, 8), + REGFMT("r16s", R16, SNORM, 1, 16), + REGFMT("rg16s", R16G16, SNORM, 2, 16), + REGFMT("rgba16s", R16G16B16A16, SNORM, 4, 16), + + REGFMT("r16hf", R16, FLOAT, 1, 16), + REGFMT("rg16hf", R16G16, FLOAT, 2, 16), + EMUFMT("rgb16hf", R16G16B16A16, FLOAT, 4, 3, 16, 16), + REGFMT("rgba16hf", R16G16B16A16, FLOAT, 4, 16), + REGFMT("r32f", R32, FLOAT, 1, 32), + REGFMT("rg32f", R32G32, FLOAT, 2, 32), + REGFMT("rgb32f", R32G32B32, FLOAT, 3, 32), + REGFMT("rgba32f", R32G32B32A32, FLOAT, 4, 32), + + EMUFMT("r16f", R16, FLOAT, 1, 1, 16, 32), + EMUFMT("rg16f", R16G16, FLOAT, 2, 2, 16, 32), + EMUFMT("rgb16f", R16G16B16A16, FLOAT, 4, 3, 16, 32), + EMUFMT("rgba16f", R16G16B16A16, FLOAT, 4, 4, 16, 32), + + REGFMT("r8u", R8, UINT, 1, 8), + REGFMT("rg8u", R8G8, UINT, 2, 8), + REGFMT("rgba8u", R8G8B8A8, UINT, 4, 8), + 
REGFMT("r16u", R16, UINT, 1, 16),
+ REGFMT("rg16u", R16G16, UINT, 2, 16),
+ REGFMT("rgba16u", R16G16B16A16, UINT, 4, 16),
+ REGFMT("r32u", R32, UINT, 1, 32),
+ REGFMT("rg32u", R32G32, UINT, 2, 32),
+ REGFMT("rgb32u", R32G32B32, UINT, 3, 32),
+ REGFMT("rgba32u", R32G32B32A32, UINT, 4, 32),
+
+ REGFMT("r8i", R8, SINT, 1, 8),
+ REGFMT("rg8i", R8G8, SINT, 2, 8),
+ REGFMT("rgba8i", R8G8B8A8, SINT, 4, 8),
+ REGFMT("r16i", R16, SINT, 1, 16),
+ REGFMT("rg16i", R16G16, SINT, 2, 16),
+ REGFMT("rgba16i", R16G16B16A16, SINT, 4, 16),
+ REGFMT("r32i", R32, SINT, 1, 32),
+ REGFMT("rg32i", R32G32, SINT, 2, 32),
+ REGFMT("rgb32i", R32G32B32, SINT, 3, 32),
+ REGFMT("rgba32i", R32G32B32A32, SINT, 4, 32),
+
+ FMT(0, "rgb10a2", R10G10B10A2, UNORM, 4, 4, BITS(10, 10, 10, 2), IDX(0, 1, 2, 3)),
+ FMT(0, "rgb10a2u", R10G10B10A2, UINT, 4, 4, BITS(10, 10, 10, 2), IDX(0, 1, 2, 3)),
+
+ FMT(0, "bgra8", B8G8R8A8, UNORM, 4, 4, BITS( 8, 8, 8, 8), IDX(2, 1, 0, 3)),
+ FMT(0, "bgrx8", B8G8R8X8, UNORM, 3, 4, BITS( 8, 8, 8), IDX(2, 1, 0)),
+ FMT(0, "rg11b10f", R11G11B10, FLOAT, 3, 4, BITS(11, 11, 10), IDX(0, 1, 2)),
+
+ // D3D11.1 16-bit formats (resurrected D3D9 formats)
+ FMT(1, "bgr565", B5G6R5, UNORM, 3, 2, BITS( 5, 6, 5), IDX(2, 1, 0)),
+ FMT(1, "bgr5a1", B5G5R5A1, UNORM, 4, 2, BITS( 5, 5, 5, 1), IDX(2, 1, 0, 3)),
+ FMT(1, "bgra4", B4G4R4A4, UNORM, 4, 2, BITS( 4, 4, 4, 4), IDX(2, 1, 0, 3)),
+
+ {0}
+};
+#undef BITS
+#undef IDX
+#undef REGFMT
+#undef FMT
+
+void pl_d3d11_setup_formats(struct pl_gpu_t *gpu)
+{
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+ PL_ARRAY(pl_fmt) formats = {0};
+ HRESULT hr;
+
+ for (int i = 0; pl_d3d11_formats[i].dxfmt; i++) {
+ const struct d3d_format *d3d_fmt = &pl_d3d11_formats[i];
+
+ // The Direct3D 11.0 debug layer will segfault if CheckFormatSupport is
+ // called on a format it doesn't know about
+ if (pl_d3d11_formats[i].minor > p->minor)
+ continue;
+
+ UINT sup = 0;
+ hr = ID3D11Device_CheckFormatSupport(p->dev, d3d_fmt->dxfmt, &sup);
+ if (FAILED(hr))
+ continue;
+
+ D3D11_FEATURE_DATA_FORMAT_SUPPORT2 sup2 = { .InFormat = d3d_fmt->dxfmt };
+ ID3D11Device_CheckFeatureSupport(p->dev, D3D11_FEATURE_FORMAT_SUPPORT2,
+ &sup2, sizeof(sup2));
+
+ struct pl_fmt_t *fmt = pl_alloc_obj(gpu, fmt, struct d3d_fmt *);
+ const struct d3d_format **fmtp = PL_PRIV(fmt);
+ *fmt = d3d_fmt->fmt;
+ *fmtp = d3d_fmt;
+
+ // For sanity, clear the superfluous fields
+ for (int j = fmt->num_components; j < 4; j++) {
+ fmt->component_depth[j] = 0;
+ fmt->sample_order[j] = 0;
+ fmt->host_bits[j] = 0;
+ }
+
+ static const struct {
+ enum pl_fmt_caps caps;
+ UINT sup;
+ UINT sup2;
+ } support[] = {
+ {
+ .caps = PL_FMT_CAP_SAMPLEABLE,
+ .sup = D3D11_FORMAT_SUPPORT_TEXTURE2D,
+ },
+ {
+ .caps = PL_FMT_CAP_STORABLE,
+ // SHADER_LOAD is for readonly images, which can use a SRV
+ .sup = D3D11_FORMAT_SUPPORT_TEXTURE2D |
+ D3D11_FORMAT_SUPPORT_TYPED_UNORDERED_ACCESS_VIEW |
+ D3D11_FORMAT_SUPPORT_SHADER_LOAD,
+ .sup2 = D3D11_FORMAT_SUPPORT2_UAV_TYPED_STORE,
+ },
+ {
+ .caps = PL_FMT_CAP_READWRITE,
+ .sup = D3D11_FORMAT_SUPPORT_TEXTURE2D |
+ D3D11_FORMAT_SUPPORT_TYPED_UNORDERED_ACCESS_VIEW,
+ .sup2 = D3D11_FORMAT_SUPPORT2_UAV_TYPED_LOAD,
+ },
+ {
+ .caps = PL_FMT_CAP_LINEAR,
+ .sup = D3D11_FORMAT_SUPPORT_TEXTURE2D |
+ D3D11_FORMAT_SUPPORT_SHADER_SAMPLE,
+ },
+ {
+ .caps = PL_FMT_CAP_RENDERABLE,
+ .sup = D3D11_FORMAT_SUPPORT_RENDER_TARGET,
+ },
+ {
+ .caps = PL_FMT_CAP_BLENDABLE,
+ .sup = D3D11_FORMAT_SUPPORT_RENDER_TARGET |
+ D3D11_FORMAT_SUPPORT_BLENDABLE,
+ },
+ {
+ .caps = PL_FMT_CAP_VERTEX,
+ .sup =
D3D11_FORMAT_SUPPORT_IA_VERTEX_BUFFER, + }, + { + .caps = PL_FMT_CAP_TEXEL_UNIFORM, + .sup = D3D11_FORMAT_SUPPORT_BUFFER | + D3D11_FORMAT_SUPPORT_SHADER_LOAD, + }, + { + .caps = PL_FMT_CAP_TEXEL_STORAGE, + // SHADER_LOAD is for readonly buffers, which can use a SRV + .sup = D3D11_FORMAT_SUPPORT_BUFFER | + D3D11_FORMAT_SUPPORT_TYPED_UNORDERED_ACCESS_VIEW | + D3D11_FORMAT_SUPPORT_SHADER_LOAD, + .sup2 = D3D11_FORMAT_SUPPORT2_UAV_TYPED_STORE, + }, + { + .caps = PL_FMT_CAP_HOST_READABLE, + .sup = D3D11_FORMAT_SUPPORT_CPU_LOCKABLE, + }, + }; + + for (int j = 0; j < PL_ARRAY_SIZE(support); j++) { + if ((sup & support[j].sup) == support[j].sup && + (sup2.OutFormatSupport2 & support[j].sup2) == support[j].sup2) + { + fmt->caps |= support[j].caps; + } + } + + // PL_FMT_CAP_STORABLE implies compute shaders, so don't set it if we + // don't have them + if (!gpu->glsl.compute) + fmt->caps &= ~PL_FMT_CAP_STORABLE; + + // PL_FMT_CAP_READWRITE implies PL_FMT_CAP_STORABLE + if (!(fmt->caps & PL_FMT_CAP_STORABLE)) + fmt->caps &= ~PL_FMT_CAP_READWRITE; + + // `fmt->gatherable` must have PL_FMT_CAP_SAMPLEABLE + if ((fmt->caps & PL_FMT_CAP_SAMPLEABLE) && + (sup & D3D11_FORMAT_SUPPORT_SHADER_GATHER)) + { + fmt->gatherable = true; + } + + // PL_FMT_CAP_BLITTABLE implies support for stretching, flipping and + // loose format conversion, which require a shader pass in D3D11 + if (p->fl >= D3D_FEATURE_LEVEL_11_0) { + // On >=FL11_0, we use a compute pass, which supports 1D and 3D + // textures + if (fmt->caps & PL_FMT_CAP_STORABLE) + fmt->caps |= PL_FMT_CAP_BLITTABLE; + } else { + // On <FL11_0 we use a raster pass + static const enum pl_fmt_caps req = PL_FMT_CAP_RENDERABLE | + PL_FMT_CAP_SAMPLEABLE; + if ((fmt->caps & req) == req) + fmt->caps |= PL_FMT_CAP_BLITTABLE; + } + + if (fmt->caps & (PL_FMT_CAP_VERTEX | PL_FMT_CAP_TEXEL_UNIFORM | + PL_FMT_CAP_TEXEL_STORAGE)) { + fmt->glsl_type = pl_var_glsl_type_name(pl_var_from_fmt(fmt, "")); + pl_assert(fmt->glsl_type); + } + + if (fmt->caps & (PL_FMT_CAP_STORABLE | PL_FMT_CAP_TEXEL_STORAGE)) + fmt->glsl_format = pl_fmt_glsl_format(fmt, fmt->num_components); + + fmt->fourcc = pl_fmt_fourcc(fmt); + + // If no caps, D3D11 only supports this for things we don't care about + if (!fmt->caps) { + pl_free(fmt); + continue; + } + + PL_ARRAY_APPEND(gpu, formats, fmt); + } + + gpu->formats = formats.elem; + gpu->num_formats = formats.num; +} diff --git a/src/d3d11/formats.h b/src/d3d11/formats.h new file mode 100644 index 0000000..08336c0 --- /dev/null +++ b/src/d3d11/formats.h @@ -0,0 +1,36 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#pragma once + +#include "common.h" + +struct d3d_format { + DXGI_FORMAT dxfmt; + int minor; // The D3D11 minor version number which supports this format + struct pl_fmt_t fmt; +}; + +extern const struct d3d_format pl_d3d11_formats[]; + +static inline DXGI_FORMAT fmt_to_dxgi(pl_fmt fmt) +{ + const struct d3d_format **fmtp = PL_PRIV(fmt); + return (*fmtp)->dxfmt; +} + +void pl_d3d11_setup_formats(struct pl_gpu_t *gpu); diff --git a/src/d3d11/gpu.c b/src/d3d11/gpu.c new file mode 100644 index 0000000..05a08a3 --- /dev/null +++ b/src/d3d11/gpu.c @@ -0,0 +1,685 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <initguid.h> +#include <windows.h> +#include <versionhelpers.h> + +#include "common.h" +#include "gpu.h" +#include "formats.h" +#include "glsl/spirv.h" + +#define DXGI_ADAPTER_FLAG3_SUPPORT_MONITORED_FENCES (0x8) + +struct timer_query { + ID3D11Query *ts_start; + ID3D11Query *ts_end; + ID3D11Query *disjoint; +}; + +struct pl_timer_t { + // Ring buffer of timer queries to use + int current; + int pending; + struct timer_query queries[16]; +}; + +void pl_d3d11_timer_start(pl_gpu gpu, pl_timer timer) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + + if (!timer) + return; + struct timer_query *query = &timer->queries[timer->current]; + + // Create the query objects lazilly + if (!query->ts_start) { + D3D(ID3D11Device_CreateQuery(p->dev, + &(D3D11_QUERY_DESC) { D3D11_QUERY_TIMESTAMP }, &query->ts_start)); + D3D(ID3D11Device_CreateQuery(p->dev, + &(D3D11_QUERY_DESC) { D3D11_QUERY_TIMESTAMP }, &query->ts_end)); + + // Measuring duration in D3D11 requires three queries: start and end + // timestamp queries, and a disjoint query containing a flag which says + // whether the timestamps are usable or if a discontinuity occurred + // between them, like a change in power state or clock speed. The + // disjoint query also contains the timer frequency, so the timestamps + // are useless without it. 
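+ // (The final duration is later derived from these queries as
+ // (ts_end - ts_start) / Frequency, converted to nanoseconds by
+ // timestamp_to_ns() below.)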
+ D3D(ID3D11Device_CreateQuery(p->dev, + &(D3D11_QUERY_DESC) { D3D11_QUERY_TIMESTAMP_DISJOINT }, &query->disjoint)); + } + + // Query the start timestamp + ID3D11DeviceContext_Begin(p->imm, (ID3D11Asynchronous *) query->disjoint); + ID3D11DeviceContext_End(p->imm, (ID3D11Asynchronous *) query->ts_start); + return; + +error: + SAFE_RELEASE(query->ts_start); + SAFE_RELEASE(query->ts_end); + SAFE_RELEASE(query->disjoint); +} + +void pl_d3d11_timer_end(pl_gpu gpu, pl_timer timer) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + + if (!timer) + return; + struct timer_query *query = &timer->queries[timer->current]; + + // Even if timer_start and timer_end are called in-order, timer_start might + // have failed to create the timer objects + if (!query->ts_start) + return; + + // Query the end timestamp + ID3D11DeviceContext_End(p->imm, (ID3D11Asynchronous *) query->ts_end); + ID3D11DeviceContext_End(p->imm, (ID3D11Asynchronous *) query->disjoint); + + // Advance to the next set of queries, for the next call to timer_start + timer->current++; + if (timer->current >= PL_ARRAY_SIZE(timer->queries)) + timer->current = 0; // Wrap around + + // Increment the number of pending queries, unless the ring buffer is full, + // in which case, timer->current now points to the oldest one, which will be + // dropped and reused + if (timer->pending < PL_ARRAY_SIZE(timer->queries)) + timer->pending++; +} + +static uint64_t timestamp_to_ns(uint64_t timestamp, uint64_t freq) +{ + static const uint64_t ns_per_s = 1000000000llu; + return timestamp / freq * ns_per_s + timestamp % freq * ns_per_s / freq; +} + +static uint64_t d3d11_timer_query(pl_gpu gpu, pl_timer timer) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + HRESULT hr; + + for (; timer->pending > 0; timer->pending--) { + int index = timer->current - timer->pending; + if (index < 0) + index += PL_ARRAY_SIZE(timer->queries); + struct timer_query *query = &timer->queries[index]; + + UINT64 start, end; + D3D11_QUERY_DATA_TIMESTAMP_DISJOINT dj; + + // Fetch the results of each query, or on S_FALSE, return 0 to indicate + // the queries are still pending + D3D(hr = ID3D11DeviceContext_GetData(p->imm, + (ID3D11Asynchronous *) query->disjoint, &dj, sizeof(dj), + D3D11_ASYNC_GETDATA_DONOTFLUSH)); + if (hr == S_FALSE) + return 0; + D3D(hr = ID3D11DeviceContext_GetData(p->imm, + (ID3D11Asynchronous *) query->ts_end, &end, sizeof(end), + D3D11_ASYNC_GETDATA_DONOTFLUSH)); + if (hr == S_FALSE) + return 0; + D3D(hr = ID3D11DeviceContext_GetData(p->imm, + (ID3D11Asynchronous *) query->ts_start, &start, sizeof(start), + D3D11_ASYNC_GETDATA_DONOTFLUSH)); + if (hr == S_FALSE) + return 0; + + // There was a discontinuity during the queries, so a timestamp can't be + // produced. Skip it and try the next one. + if (dj.Disjoint || !dj.Frequency) + continue; + + // We got a result. Return it to the caller. 
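+        // Decremented manually here, because returning from inside the loop
+        // skips the for loop's own `timer->pending--` update expression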
+ timer->pending--; + pl_d3d11_flush_message_queue(ctx, "After timer query"); + + uint64_t ns = timestamp_to_ns(end - start, dj.Frequency); + return PL_MAX(ns, 1); + + error: + // There was an error fetching the timer result, so skip it and try the + // next one + continue; + } + + // No more unprocessed results + return 0; +} + +static void d3d11_timer_destroy(pl_gpu gpu, pl_timer timer) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + + for (int i = 0; i < PL_ARRAY_SIZE(timer->queries); i++) { + SAFE_RELEASE(timer->queries[i].ts_start); + SAFE_RELEASE(timer->queries[i].ts_end); + SAFE_RELEASE(timer->queries[i].disjoint); + } + + pl_d3d11_flush_message_queue(ctx, "After timer destroy"); + + pl_free(timer); +} + +static pl_timer d3d11_timer_create(pl_gpu gpu) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + if (!p->has_timestamp_queries) + return NULL; + + struct pl_timer_t *timer = pl_alloc_ptr(NULL, timer); + *timer = (struct pl_timer_t) {0}; + return timer; +} + +static int d3d11_desc_namespace(pl_gpu gpu, enum pl_desc_type type) +{ + // Vulkan-style binding, where all descriptors are in the same namespace, is + // required to use SPIRV-Cross' HLSL resource mapping API, which targets + // resources by binding number + return 0; +} + +static void d3d11_gpu_flush(pl_gpu gpu) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + ID3D11DeviceContext_Flush(p->imm); + + pl_d3d11_flush_message_queue(ctx, "After gpu flush"); +} + +static void d3d11_gpu_finish(pl_gpu gpu) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + HRESULT hr; + + if (p->finish_fence) { + p->finish_value++; + D3D(ID3D11Fence_SetEventOnCompletion(p->finish_fence, p->finish_value, + p->finish_event)); + ID3D11DeviceContext4_Signal(p->imm4, p->finish_fence, p->finish_value); + ID3D11DeviceContext_Flush(p->imm); + WaitForSingleObject(p->finish_event, INFINITE); + } else { + ID3D11DeviceContext_End(p->imm, (ID3D11Asynchronous *) p->finish_query); + + // D3D11 doesn't have blocking queries, but it does have blocking + // readback. As a performance hack to try to avoid polling, do a dummy + // copy/readback between two buffers. Hopefully this will block until + // all prior commands are finished. If it does, the first GetData call + // will return a result and we won't have to poll. 
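+        //
+        // (finish_buf_src/finish_buf_dst are the tiny 4-byte buffers created
+        // in pl_gpu_create_d3d11 below. Since neither has a system-memory
+        // mirror, pl_buf_read maps a STAGING copy with D3D11_MAP_READ, and
+        // that Map call should not return until the GPU has finished the
+        // copy, which is what lets this readback act as a fence.)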
+ pl_buf_copy(gpu, p->finish_buf_dst, 0, p->finish_buf_src, 0, sizeof(uint32_t)); + pl_buf_read(gpu, p->finish_buf_dst, 0, &(uint32_t) {0}, sizeof(uint32_t)); + + // Poll the event query until it completes + for (;;) { + BOOL idle; + D3D(hr = ID3D11DeviceContext_GetData(p->imm, + (ID3D11Asynchronous *) p->finish_query, &idle, sizeof(idle), 0)); + if (hr == S_OK && idle) + break; + Sleep(1); + } + } + + pl_d3d11_flush_message_queue(ctx, "After gpu finish"); + +error: + return; +} + +static bool d3d11_gpu_is_failed(pl_gpu gpu) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + + if (ctx->is_failed) + return true; + + // GetDeviceRemovedReason returns S_OK if the device isn't removed + HRESULT hr = ID3D11Device_GetDeviceRemovedReason(p->dev); + if (FAILED(hr)) { + ctx->is_failed = true; + pl_d3d11_after_error(ctx, hr); + } + + return ctx->is_failed; +} + +static void d3d11_gpu_destroy(pl_gpu gpu) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + + pl_buf_destroy(gpu, &p->finish_buf_src); + pl_buf_destroy(gpu, &p->finish_buf_dst); + + // Release everything except the immediate context + SAFE_RELEASE(p->dev); + SAFE_RELEASE(p->dev1); + SAFE_RELEASE(p->dev5); + SAFE_RELEASE(p->imm1); + SAFE_RELEASE(p->imm4); + SAFE_RELEASE(p->vbuf.buf); + SAFE_RELEASE(p->ibuf.buf); + SAFE_RELEASE(p->rstate); + SAFE_RELEASE(p->dsstate); + for (int i = 0; i < PL_TEX_SAMPLE_MODE_COUNT; i++) { + for (int j = 0; j < PL_TEX_ADDRESS_MODE_COUNT; j++) { + SAFE_RELEASE(p->samplers[i][j]); + } + } + SAFE_RELEASE(p->finish_fence); + if (p->finish_event) + CloseHandle(p->finish_event); + SAFE_RELEASE(p->finish_query); + + // Destroy the immediate context synchronously so referenced objects don't + // show up in the leak check + if (p->imm) { + ID3D11DeviceContext_ClearState(p->imm); + ID3D11DeviceContext_Flush(p->imm); + SAFE_RELEASE(p->imm); + } + + pl_spirv_destroy(&p->spirv); + pl_free((void *) gpu); +} + +pl_d3d11 pl_d3d11_get(pl_gpu gpu) +{ + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + if (impl->destroy == d3d11_gpu_destroy) { + struct pl_gpu_d3d11 *p = (struct pl_gpu_d3d11 *) impl; + return p->ctx->d3d11; + } + + return NULL; +} + +static bool load_d3d_compiler(pl_gpu gpu) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + HMODULE d3dcompiler = NULL; + + static const struct { + const wchar_t *name; + bool inbox; + } compiler_dlls[] = { + // Try the inbox D3DCompiler first (Windows 8.1 and up) + { .name = L"d3dcompiler_47.dll", .inbox = true }, + // Check for a packaged version of d3dcompiler_47.dll + { .name = L"d3dcompiler_47.dll" }, + // Try d3dcompiler_46.dll from the Windows 8 SDK + { .name = L"d3dcompiler_46.dll" }, + // Try d3dcompiler_43.dll from the June 2010 DirectX SDK + { .name = L"d3dcompiler_43.dll" }, + }; + + for (int i = 0; i < PL_ARRAY_SIZE(compiler_dlls); i++) { + if (compiler_dlls[i].inbox) { + if (!IsWindows8Point1OrGreater()) + continue; + d3dcompiler = LoadLibraryExW(compiler_dlls[i].name, NULL, + LOAD_LIBRARY_SEARCH_SYSTEM32); + } else { + d3dcompiler = LoadLibraryW(compiler_dlls[i].name); + } + if (!d3dcompiler) + continue; + + p->D3DCompile = (void *) GetProcAddress(d3dcompiler, "D3DCompile"); + if (!p->D3DCompile) + return false; + p->d3d_compiler_ver = pl_get_dll_version(compiler_dlls[i].name); + + return true; + } + + return false; +} + +static struct pl_gpu_fns pl_fns_d3d11 = { + .tex_create = pl_d3d11_tex_create, + .tex_destroy = pl_d3d11_tex_destroy, + .tex_invalidate = pl_d3d11_tex_invalidate, + .tex_clear_ex = pl_d3d11_tex_clear_ex, + .tex_blit = 
pl_d3d11_tex_blit, + .tex_upload = pl_d3d11_tex_upload, + .tex_download = pl_d3d11_tex_download, + .buf_create = pl_d3d11_buf_create, + .buf_destroy = pl_d3d11_buf_destroy, + .buf_write = pl_d3d11_buf_write, + .buf_read = pl_d3d11_buf_read, + .buf_copy = pl_d3d11_buf_copy, + .desc_namespace = d3d11_desc_namespace, + .pass_create = pl_d3d11_pass_create, + .pass_destroy = pl_d3d11_pass_destroy, + .pass_run = pl_d3d11_pass_run, + .timer_create = d3d11_timer_create, + .timer_destroy = d3d11_timer_destroy, + .timer_query = d3d11_timer_query, + .gpu_flush = d3d11_gpu_flush, + .gpu_finish = d3d11_gpu_finish, + .gpu_is_failed = d3d11_gpu_is_failed, + .destroy = d3d11_gpu_destroy, +}; + +pl_gpu pl_gpu_create_d3d11(struct d3d11_ctx *ctx) +{ + pl_assert(ctx->dev); + IDXGIDevice1 *dxgi_dev = NULL; + IDXGIAdapter1 *adapter = NULL; + IDXGIAdapter4 *adapter4 = NULL; + bool success = false; + HRESULT hr; + + struct pl_gpu_t *gpu = pl_zalloc_obj(NULL, gpu, struct pl_gpu_d3d11); + gpu->log = ctx->log; + + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + uint32_t spirv_ver = PL_MIN(SPV_VERSION, PL_MAX_SPIRV_VER); + *p = (struct pl_gpu_d3d11) { + .ctx = ctx, + .impl = pl_fns_d3d11, + .dev = ctx->dev, + .spirv = pl_spirv_create(ctx->log, (struct pl_spirv_version) { + .env_version = pl_spirv_version_to_vulkan(spirv_ver), + .spv_version = spirv_ver, + }), + .vbuf.bind_flags = D3D11_BIND_VERTEX_BUFFER, + .ibuf.bind_flags = D3D11_BIND_INDEX_BUFFER, + }; + if (!p->spirv) + goto error; + + ID3D11Device_AddRef(p->dev); + ID3D11Device_GetImmediateContext(p->dev, &p->imm); + + // Check D3D11.1 interfaces + hr = ID3D11Device_QueryInterface(p->dev, &IID_ID3D11Device1, + (void **) &p->dev1); + if (SUCCEEDED(hr)) { + p->minor = 1; + ID3D11Device1_GetImmediateContext1(p->dev1, &p->imm1); + } + + // Check D3D11.4 interfaces + hr = ID3D11Device_QueryInterface(p->dev, &IID_ID3D11Device5, + (void **) &p->dev5); + if (SUCCEEDED(hr)) { + // There is no GetImmediateContext4 method + hr = ID3D11DeviceContext_QueryInterface(p->imm, &IID_ID3D11DeviceContext4, + (void **) &p->imm4); + if (SUCCEEDED(hr)) + p->minor = 4; + } + + PL_INFO(gpu, "Using Direct3D 11.%d runtime", p->minor); + + D3D(ID3D11Device_QueryInterface(p->dev, &IID_IDXGIDevice1, (void **) &dxgi_dev)); + D3D(IDXGIDevice1_GetParent(dxgi_dev, &IID_IDXGIAdapter1, (void **) &adapter)); + + DXGI_ADAPTER_DESC1 adapter_desc = {0}; + IDXGIAdapter1_GetDesc1(adapter, &adapter_desc); + + // No resource can be larger than max_res_size in bytes + unsigned int max_res_size = PL_CLAMP( + D3D11_REQ_RESOURCE_SIZE_IN_MEGABYTES_EXPRESSION_B_TERM * adapter_desc.DedicatedVideoMemory, + D3D11_REQ_RESOURCE_SIZE_IN_MEGABYTES_EXPRESSION_A_TERM * 1024u * 1024u, + D3D11_REQ_RESOURCE_SIZE_IN_MEGABYTES_EXPRESSION_C_TERM * 1024u * 1024u); + + gpu->glsl = (struct pl_glsl_version) { + .version = 450, + .vulkan = true, + }; + + gpu->limits = (struct pl_gpu_limits) { + .max_buf_size = max_res_size, + .max_ssbo_size = max_res_size, + .max_vbo_size = max_res_size, + .align_vertex_stride = 1, + + // Make up some values + .align_tex_xfer_offset = 32, + .align_tex_xfer_pitch = 1, + .fragment_queues = 1, + }; + + p->fl = ID3D11Device_GetFeatureLevel(p->dev); + + // If we're not using FL9_x, we can use the same suballocated buffer as a + // vertex buffer and index buffer + if (p->fl >= D3D_FEATURE_LEVEL_10_0) + p->vbuf.bind_flags |= D3D11_BIND_INDEX_BUFFER; + + if (p->fl >= D3D_FEATURE_LEVEL_10_0) { + gpu->limits.max_ubo_size = D3D11_REQ_CONSTANT_BUFFER_ELEMENT_COUNT * CBUF_ELEM; + } else { + // 10level9 restriction: + 
// https://docs.microsoft.com/en-us/windows/win32/direct3d11/d3d11-graphics-reference-10level9-context + gpu->limits.max_ubo_size = 255 * CBUF_ELEM; + } + + if (p->fl >= D3D_FEATURE_LEVEL_11_0) { + gpu->limits.max_tex_1d_dim = D3D11_REQ_TEXTURE1D_U_DIMENSION; + gpu->limits.max_tex_2d_dim = D3D11_REQ_TEXTURE2D_U_OR_V_DIMENSION; + gpu->limits.max_tex_3d_dim = D3D11_REQ_TEXTURE3D_U_V_OR_W_DIMENSION; + } else if (p->fl >= D3D_FEATURE_LEVEL_10_0) { + gpu->limits.max_tex_1d_dim = D3D10_REQ_TEXTURE1D_U_DIMENSION; + gpu->limits.max_tex_2d_dim = D3D10_REQ_TEXTURE2D_U_OR_V_DIMENSION; + gpu->limits.max_tex_3d_dim = D3D10_REQ_TEXTURE3D_U_V_OR_W_DIMENSION; + } else if (p->fl >= D3D_FEATURE_LEVEL_9_3) { + gpu->limits.max_tex_2d_dim = D3D_FL9_3_REQ_TEXTURE2D_U_OR_V_DIMENSION; + // Same limit as FL9_1 + gpu->limits.max_tex_3d_dim = D3D_FL9_1_REQ_TEXTURE3D_U_V_OR_W_DIMENSION; + } else { + gpu->limits.max_tex_2d_dim = D3D_FL9_1_REQ_TEXTURE2D_U_OR_V_DIMENSION; + gpu->limits.max_tex_3d_dim = D3D_FL9_1_REQ_TEXTURE3D_U_V_OR_W_DIMENSION; + } + + if (p->fl >= D3D_FEATURE_LEVEL_10_0) { + gpu->limits.max_buffer_texels = + 1 << D3D11_REQ_BUFFER_RESOURCE_TEXEL_COUNT_2_TO_EXP; + } + + if (p->fl >= D3D_FEATURE_LEVEL_11_0) { + gpu->glsl.compute = true; + gpu->limits.compute_queues = 1; + // Set `gpu->limits.blittable_1d_3d`, since `pl_tex_blit_compute`, which + // is used to emulate blits on 11_0 and up, supports 1D and 3D textures + gpu->limits.blittable_1d_3d = true; + + gpu->glsl.max_shmem_size = D3D11_CS_TGSM_REGISTER_COUNT * sizeof(float); + gpu->glsl.max_group_threads = D3D11_CS_THREAD_GROUP_MAX_THREADS_PER_GROUP; + gpu->glsl.max_group_size[0] = D3D11_CS_THREAD_GROUP_MAX_X; + gpu->glsl.max_group_size[1] = D3D11_CS_THREAD_GROUP_MAX_Y; + gpu->glsl.max_group_size[2] = D3D11_CS_THREAD_GROUP_MAX_Z; + gpu->limits.max_dispatch[0] = gpu->limits.max_dispatch[1] = + gpu->limits.max_dispatch[2] = + D3D11_CS_DISPATCH_MAX_THREAD_GROUPS_PER_DIMENSION; + } + + if (p->fl >= D3D_FEATURE_LEVEL_11_0) { + // The offset limits are defined by HLSL: + // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/gather4-po--sm5---asm- + gpu->glsl.min_gather_offset = -32; + gpu->glsl.max_gather_offset = 31; + } else if (p->fl >= D3D_FEATURE_LEVEL_10_1) { + // SM4.1 has no gather4_po, so the offset must be specified by an + // immediate with a range of [-8, 7] + // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/gather4--sm4-1---asm- + // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/sample--sm4---asm-#address-offset + gpu->glsl.min_gather_offset = -8; + gpu->glsl.max_gather_offset = 7; + } + + if (p->fl >= D3D_FEATURE_LEVEL_10_0) { + p->max_srvs = D3D11_COMMONSHADER_INPUT_RESOURCE_SLOT_COUNT; + } else { + // 10level9 restriction: + // https://docs.microsoft.com/en-us/windows/win32/direct3d11/d3d11-graphics-reference-10level9-context + p->max_srvs = 8; + } + + if (p->fl >= D3D_FEATURE_LEVEL_11_1) { + p->max_uavs = D3D11_1_UAV_SLOT_COUNT; + } else { + p->max_uavs = D3D11_PS_CS_UAV_REGISTER_COUNT; + } + + if (!load_d3d_compiler(gpu)) { + PL_FATAL(gpu, "Could not find D3DCompiler DLL"); + goto error; + } + PL_INFO(gpu, "D3DCompiler version: %u.%u.%u.%u", + p->d3d_compiler_ver.major, p->d3d_compiler_ver.minor, + p->d3d_compiler_ver.build, p->d3d_compiler_ver.revision); + + // Detect support for timestamp queries. Some FL9_x devices don't support them. 
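+    // Passing NULL for the output pointer asks CreateQuery to only validate
+    // the query description: it returns S_FALSE (which still satisfies
+    // SUCCEEDED()) when the query type is supported, without actually
+    // allocating a query object.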
+ hr = ID3D11Device_CreateQuery(p->dev, + &(D3D11_QUERY_DESC) { D3D11_QUERY_TIMESTAMP }, NULL); + p->has_timestamp_queries = SUCCEEDED(hr); + + pl_d3d11_setup_formats(gpu); + + // The rasterizer state never changes, so create it here + D3D11_RASTERIZER_DESC rdesc = { + .FillMode = D3D11_FILL_SOLID, + .CullMode = D3D11_CULL_NONE, + .FrontCounterClockwise = FALSE, + .DepthClipEnable = TRUE, // Required for 10level9 + .ScissorEnable = TRUE, + }; + D3D(ID3D11Device_CreateRasterizerState(p->dev, &rdesc, &p->rstate)); + + // The depth stencil state never changes either, and we only set it to turn + // depth testing off so the debug layer doesn't complain about an unbound + // depth buffer + D3D11_DEPTH_STENCIL_DESC dsdesc = { + .DepthEnable = FALSE, + .DepthWriteMask = D3D11_DEPTH_WRITE_MASK_ALL, + .DepthFunc = D3D11_COMPARISON_LESS, + .StencilReadMask = D3D11_DEFAULT_STENCIL_READ_MASK, + .StencilWriteMask = D3D11_DEFAULT_STENCIL_WRITE_MASK, + .FrontFace = { + .StencilFailOp = D3D11_STENCIL_OP_KEEP, + .StencilDepthFailOp = D3D11_STENCIL_OP_KEEP, + .StencilPassOp = D3D11_STENCIL_OP_KEEP, + .StencilFunc = D3D11_COMPARISON_ALWAYS, + }, + .BackFace = { + .StencilFailOp = D3D11_STENCIL_OP_KEEP, + .StencilDepthFailOp = D3D11_STENCIL_OP_KEEP, + .StencilPassOp = D3D11_STENCIL_OP_KEEP, + .StencilFunc = D3D11_COMPARISON_ALWAYS, + }, + }; + D3D(ID3D11Device_CreateDepthStencilState(p->dev, &dsdesc, &p->dsstate)); + + // Initialize the samplers + for (int sample_mode = 0; sample_mode < PL_TEX_SAMPLE_MODE_COUNT; sample_mode++) { + for (int address_mode = 0; address_mode < PL_TEX_ADDRESS_MODE_COUNT; address_mode++) { + static const D3D11_TEXTURE_ADDRESS_MODE d3d_address_mode[] = { + [PL_TEX_ADDRESS_CLAMP] = D3D11_TEXTURE_ADDRESS_CLAMP, + [PL_TEX_ADDRESS_REPEAT] = D3D11_TEXTURE_ADDRESS_WRAP, + [PL_TEX_ADDRESS_MIRROR] = D3D11_TEXTURE_ADDRESS_MIRROR, + }; + static const D3D11_FILTER d3d_filter[] = { + [PL_TEX_SAMPLE_NEAREST] = D3D11_FILTER_MIN_MAG_MIP_POINT, + [PL_TEX_SAMPLE_LINEAR] = D3D11_FILTER_MIN_MAG_MIP_LINEAR, + }; + + D3D11_SAMPLER_DESC sdesc = { + .AddressU = d3d_address_mode[address_mode], + .AddressV = d3d_address_mode[address_mode], + .AddressW = d3d_address_mode[address_mode], + .ComparisonFunc = D3D11_COMPARISON_NEVER, + .MinLOD = 0, + .MaxLOD = D3D11_FLOAT32_MAX, + .MaxAnisotropy = 1, + .Filter = d3d_filter[sample_mode], + }; + D3D(ID3D11Device_CreateSamplerState(p->dev, &sdesc, + &p->samplers[sample_mode][address_mode])); + } + } + + hr = IDXGIAdapter1_QueryInterface(adapter, &IID_IDXGIAdapter4, + (void **) &adapter4); + if (SUCCEEDED(hr)) { + DXGI_ADAPTER_DESC3 adapter_desc3 = {0}; + IDXGIAdapter4_GetDesc3(adapter4, &adapter_desc3); + + p->has_monitored_fences = + adapter_desc3.Flags & DXGI_ADAPTER_FLAG3_SUPPORT_MONITORED_FENCES; + } + + // Try to create a D3D11.4 fence object to wait on in pl_gpu_finish() + if (p->dev5 && p->has_monitored_fences) { + hr = ID3D11Device5_CreateFence(p->dev5, 0, D3D11_FENCE_FLAG_NONE, + &IID_ID3D11Fence, + (void **) &p->finish_fence); + if (SUCCEEDED(hr)) { + p->finish_event = CreateEventW(NULL, FALSE, FALSE, NULL); + if (!p->finish_event) { + PL_ERR(gpu, "Failed to create finish() event"); + goto error; + } + } + } + + // If fences are not available, we will have to poll a event query instead + if (!p->finish_fence) { + // Buffers for dummy copy/readback (see d3d11_gpu_finish()) + p->finish_buf_src = pl_buf_create(gpu, pl_buf_params( + .size = sizeof(uint32_t), + .drawable = true, // Make these vertex buffers for 10level9 + .initial_data = &(uint32_t) 
{0x11223344}, + )); + p->finish_buf_dst = pl_buf_create(gpu, pl_buf_params( + .size = sizeof(uint32_t), + .host_readable = true, + .drawable = true, + )); + + D3D(ID3D11Device_CreateQuery(p->dev, + &(D3D11_QUERY_DESC) { D3D11_QUERY_EVENT }, &p->finish_query)); + } + + pl_d3d11_flush_message_queue(ctx, "After gpu create"); + + success = true; +error: + SAFE_RELEASE(dxgi_dev); + SAFE_RELEASE(adapter); + SAFE_RELEASE(adapter4); + if (success) { + return pl_gpu_finalize(gpu); + } else { + d3d11_gpu_destroy(gpu); + return NULL; + } +} diff --git a/src/d3d11/gpu.h b/src/d3d11/gpu.h new file mode 100644 index 0000000..cbc706a --- /dev/null +++ b/src/d3d11/gpu.h @@ -0,0 +1,212 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include <stdalign.h> +#include <d3d11_4.h> +#include <dxgi1_6.h> +#include <d3dcompiler.h> +#include <spirv_cross_c.h> + +#include "../gpu.h" +#include "../glsl/spirv.h" + +#include "common.h" +#include "utils.h" + +pl_gpu pl_gpu_create_d3d11(struct d3d11_ctx *ctx); + +// --- pl_gpu internal structs and helpers + +// Size of one constant in a constant buffer +#define CBUF_ELEM (sizeof(float[4])) + +struct d3d_stream_buf { + UINT bind_flags; + ID3D11Buffer *buf; + size_t size; + size_t used; + unsigned int align; +}; + +struct pl_gpu_d3d11 { + struct pl_gpu_fns impl; + struct d3d11_ctx *ctx; + ID3D11Device *dev; + ID3D11Device1 *dev1; + ID3D11Device5 *dev5; + ID3D11DeviceContext *imm; + ID3D11DeviceContext1 *imm1; + ID3D11DeviceContext4 *imm4; + + // The Direct3D 11 minor version number + int minor; + + pl_spirv spirv; + + pD3DCompile D3DCompile; + struct dll_version d3d_compiler_ver; + + // Device capabilities + D3D_FEATURE_LEVEL fl; + bool has_timestamp_queries; + bool has_monitored_fences; + + int max_srvs; + int max_uavs; + + // Streaming vertex and index buffers + struct d3d_stream_buf vbuf; + struct d3d_stream_buf ibuf; + + // Shared rasterizer state + ID3D11RasterizerState *rstate; + + // Shared depth-stencil state + ID3D11DepthStencilState *dsstate; + + // Array of ID3D11SamplerStates for every combination of sample/address modes + ID3D11SamplerState *samplers[PL_TEX_SAMPLE_MODE_COUNT][PL_TEX_ADDRESS_MODE_COUNT]; + + // Resources for finish() + ID3D11Fence *finish_fence; + uint64_t finish_value; + HANDLE finish_event; + ID3D11Query *finish_query; + pl_buf finish_buf_src; + pl_buf finish_buf_dst; +}; + +void pl_d3d11_setup_formats(struct pl_gpu_t *gpu); + +void pl_d3d11_timer_start(pl_gpu gpu, pl_timer timer); +void pl_d3d11_timer_end(pl_gpu gpu, pl_timer timer); + +struct pl_buf_d3d11 { + ID3D11Buffer *buf; + ID3D11Buffer *staging; + ID3D11ShaderResourceView *raw_srv; + ID3D11UnorderedAccessView *raw_uav; + ID3D11ShaderResourceView *texel_srv; + ID3D11UnorderedAccessView *texel_uav; + + char *data; + bool dirty; +}; + +void pl_d3d11_buf_destroy(pl_gpu gpu, pl_buf buf); +pl_buf 
pl_d3d11_buf_create(pl_gpu gpu, const struct pl_buf_params *params); +void pl_d3d11_buf_write(pl_gpu gpu, pl_buf buf, size_t offset, const void *data, + size_t size); +bool pl_d3d11_buf_read(pl_gpu gpu, pl_buf buf, size_t offset, void *dest, + size_t size); +void pl_d3d11_buf_copy(pl_gpu gpu, pl_buf dst, size_t dst_offset, pl_buf src, + size_t src_offset, size_t size); + +// Ensure a buffer is up-to-date with its system memory mirror before it is used +void pl_d3d11_buf_resolve(pl_gpu gpu, pl_buf buf); + +struct pl_tex_d3d11 { + // res mirrors one of tex1d, tex2d or tex3d for convenience. It does not + // hold an additional reference to the texture object. + ID3D11Resource *res; + + ID3D11Texture1D *tex1d; + ID3D11Texture2D *tex2d; + ID3D11Texture3D *tex3d; + int array_slice; + + // Mirrors one of staging1d, staging2d, or staging3d, and doesn't hold a ref + ID3D11Resource *staging; + + // Staging textures for pl_tex_download + ID3D11Texture1D *staging1d; + ID3D11Texture2D *staging2d; + ID3D11Texture3D *staging3d; + + ID3D11ShaderResourceView *srv; + ID3D11RenderTargetView *rtv; + ID3D11UnorderedAccessView *uav; + + // for tex_upload/download fallback code + pl_fmt texel_fmt; +}; + +void pl_d3d11_tex_destroy(pl_gpu gpu, pl_tex tex); +pl_tex pl_d3d11_tex_create(pl_gpu gpu, const struct pl_tex_params *params); +void pl_d3d11_tex_invalidate(pl_gpu gpu, pl_tex tex); +void pl_d3d11_tex_clear_ex(pl_gpu gpu, pl_tex tex, + const union pl_clear_color color); +void pl_d3d11_tex_blit(pl_gpu gpu, const struct pl_tex_blit_params *params); +bool pl_d3d11_tex_upload(pl_gpu gpu, const struct pl_tex_transfer_params *params); +bool pl_d3d11_tex_download(pl_gpu gpu, const struct pl_tex_transfer_params *params); + +// Constant buffer layout used for gl_NumWorkGroups emulation +struct d3d_num_workgroups_buf { + alignas(CBUF_ELEM) uint32_t num_wgs[3]; +}; + +enum { + HLSL_BINDING_NOT_USED = -1, // Slot should always be bound as NULL + HLSL_BINDING_NUM_WORKGROUPS = -2, // Slot used for gl_NumWorkGroups emulation +}; + +// Represents a specific shader stage in a pl_pass (VS, PS, CS) +struct d3d_pass_stage { + // Lists for each resource type, to simplify binding in pl_pass_run. Indexes + // match the index of the arrays passed to the ID3D11DeviceContext methods. + // Entries are the index of pass->params.descriptors which should be bound + // in that position, or a HLSL_BINDING_* special value. 
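+    //
+    // For example (purely illustrative): cbvs = {3, HLSL_BINDING_NUM_WORKGROUPS}
+    // means descriptor index 3 is bound at constant buffer register b0 and
+    // the gl_NumWorkGroups emulation buffer at b1.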
+ PL_ARRAY(int) cbvs; + PL_ARRAY(int) srvs; + PL_ARRAY(int) samplers; +}; + +struct pl_pass_d3d11 { + ID3D11PixelShader *ps; + ID3D11VertexShader *vs; + ID3D11ComputeShader *cs; + ID3D11InputLayout *layout; + ID3D11BlendState *bstate; + + // gl_NumWorkGroups emulation + struct d3d_num_workgroups_buf last_num_wgs; + ID3D11Buffer *num_workgroups_buf; + bool num_workgroups_used; + + // Maximum binding number + int max_binding; + + struct d3d_pass_stage main; // PS and CS + struct d3d_pass_stage vertex; + + // List of resources, as in `struct pass_stage`, except UAVs are shared + // between all shader stages + PL_ARRAY(int) uavs; + + // Pre-allocated resource arrays to use in pl_pass_run + ID3D11Buffer **cbv_arr; + ID3D11ShaderResourceView **srv_arr; + ID3D11SamplerState **sampler_arr; + ID3D11UnorderedAccessView **uav_arr; +}; + +void pl_d3d11_pass_destroy(pl_gpu gpu, pl_pass pass); +const struct pl_pass_t *pl_d3d11_pass_create(pl_gpu gpu, + const struct pl_pass_params *params); +void pl_d3d11_pass_run(pl_gpu gpu, const struct pl_pass_run_params *params); diff --git a/src/d3d11/gpu_buf.c b/src/d3d11/gpu_buf.c new file mode 100644 index 0000000..955e6e1 --- /dev/null +++ b/src/d3d11/gpu_buf.c @@ -0,0 +1,310 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "gpu.h" +#include "formats.h" + +void pl_d3d11_buf_destroy(pl_gpu gpu, pl_buf buf) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + struct pl_buf_d3d11 *buf_p = PL_PRIV(buf); + + SAFE_RELEASE(buf_p->buf); + SAFE_RELEASE(buf_p->staging); + SAFE_RELEASE(buf_p->raw_srv); + SAFE_RELEASE(buf_p->raw_uav); + SAFE_RELEASE(buf_p->texel_srv); + SAFE_RELEASE(buf_p->texel_uav); + + pl_d3d11_flush_message_queue(ctx, "After buffer destroy"); + + pl_free((void *) buf); +} + +pl_buf pl_d3d11_buf_create(pl_gpu gpu, const struct pl_buf_params *params) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + + struct pl_buf_t *buf = pl_zalloc_obj(NULL, buf, struct pl_buf_d3d11); + buf->params = *params; + buf->params.initial_data = NULL; + + struct pl_buf_d3d11 *buf_p = PL_PRIV(buf); + + D3D11_BUFFER_DESC desc = { .ByteWidth = params->size }; + + if (params->uniform && !params->format && + (params->storable || params->drawable)) + { + // TODO: Figure out what to do with these + PL_ERR(gpu, "Uniform buffers cannot share any other buffer type"); + goto error; + } + + // TODO: Distinguish between uniform buffers and texel uniform buffers. + // Currently we assume that if uniform and format are set, it's a texel + // buffer and NOT a uniform buffer. 
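+    // In short: (uniform && !format) becomes a constant buffer,
+    // (uniform && format) a typed buffer SRV, `storable` adds raw SRV/UAV
+    // views, and `drawable` makes it bindable as a vertex buffer (plus an
+    // index buffer on FL10_0 and up).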
+ if (params->uniform && !params->format) { + desc.BindFlags |= D3D11_BIND_CONSTANT_BUFFER; + desc.ByteWidth = PL_ALIGN2(desc.ByteWidth, CBUF_ELEM); + } + if (params->uniform && params->format) { + desc.BindFlags |= D3D11_BIND_SHADER_RESOURCE; + } + if (params->storable) { + desc.BindFlags |= D3D11_BIND_UNORDERED_ACCESS + | D3D11_BIND_SHADER_RESOURCE; + desc.ByteWidth = PL_ALIGN2(desc.ByteWidth, sizeof(float)); + desc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_ALLOW_RAW_VIEWS; + } + if (params->drawable) { + desc.BindFlags |= D3D11_BIND_VERTEX_BUFFER; + + // In FL9_x, a vertex buffer can't also be an index buffer, so index + // buffers are unsupported in FL9_x for now + if (p->fl > D3D_FEATURE_LEVEL_9_3) + desc.BindFlags |= D3D11_BIND_INDEX_BUFFER; + } + + char *data = NULL; + + // D3D11 doesn't allow partial constant buffer updates without special + // conditions. To support partial buffer updates, keep a mirror of the + // buffer data in system memory and upload the whole thing before the buffer + // is used. + // + // Note: We don't use a staging buffer for this because of Intel. + // https://github.com/mpv-player/mpv/issues/5293 + // https://crbug.com/593024 + if (params->uniform && !params->format && params->host_writable) { + data = pl_zalloc(buf, desc.ByteWidth); + buf_p->data = data; + } + + D3D11_SUBRESOURCE_DATA srdata = { 0 }; + if (params->initial_data) { + if (desc.ByteWidth != params->size) { + // If the size had to be rounded-up, uploading from + // params->initial_data is technically undefined behavior, so copy + // the initial data to an allocation first + if (!data) + data = pl_zalloc(buf, desc.ByteWidth); + srdata.pSysMem = data; + } else { + srdata.pSysMem = params->initial_data; + } + + if (data) + memcpy(data, params->initial_data, params->size); + } + + D3D(ID3D11Device_CreateBuffer(p->dev, &desc, + params->initial_data ? 
&srdata : NULL, + &buf_p->buf)); + + if (!buf_p->data) + pl_free(data); + + // Create raw views for PL_DESC_BUF_STORAGE + if (params->storable) { + // A SRV is used for PL_DESC_ACCESS_READONLY + D3D11_SHADER_RESOURCE_VIEW_DESC sdesc = { + .Format = DXGI_FORMAT_R32_TYPELESS, + .ViewDimension = D3D11_SRV_DIMENSION_BUFFEREX, + .BufferEx = { + .NumElements = + PL_ALIGN2(buf->params.size, sizeof(float)) / sizeof(float), + .Flags = D3D11_BUFFEREX_SRV_FLAG_RAW, + }, + }; + D3D(ID3D11Device_CreateShaderResourceView(p->dev, + (ID3D11Resource *) buf_p->buf, &sdesc, &buf_p->raw_srv)); + + // A UAV is used for all other access modes + D3D11_UNORDERED_ACCESS_VIEW_DESC udesc = { + .Format = DXGI_FORMAT_R32_TYPELESS, + .ViewDimension = D3D11_UAV_DIMENSION_BUFFER, + .Buffer = { + .NumElements = + PL_ALIGN2(buf->params.size, sizeof(float)) / sizeof(float), + .Flags = D3D11_BUFFER_UAV_FLAG_RAW, + }, + }; + D3D(ID3D11Device_CreateUnorderedAccessView(p->dev, + (ID3D11Resource *) buf_p->buf, &udesc, &buf_p->raw_uav)); + } + + // Create a typed SRV for PL_BUF_TEXEL_UNIFORM and PL_BUF_TEXEL_STORAGE + if (params->format) { + if (params->uniform) { + D3D11_SHADER_RESOURCE_VIEW_DESC sdesc = { + .Format = fmt_to_dxgi(params->format), + .ViewDimension = D3D11_SRV_DIMENSION_BUFFER, + .Buffer = { + .NumElements = + PL_ALIGN(buf->params.size, buf->params.format->texel_size) + / buf->params.format->texel_size, + }, + }; + D3D(ID3D11Device_CreateShaderResourceView(p->dev, + (ID3D11Resource *) buf_p->buf, &sdesc, &buf_p->texel_srv)); + } + + // Create a typed UAV for PL_BUF_TEXEL_STORAGE + if (params->storable) { + D3D11_UNORDERED_ACCESS_VIEW_DESC udesc = { + .Format = fmt_to_dxgi(buf->params.format), + .ViewDimension = D3D11_UAV_DIMENSION_BUFFER, + .Buffer = { + .NumElements = + PL_ALIGN(buf->params.size, buf->params.format->texel_size) + / buf->params.format->texel_size, + }, + }; + D3D(ID3D11Device_CreateUnorderedAccessView(p->dev, + (ID3D11Resource *) buf_p->buf, &udesc, &buf_p->texel_uav)); + } + } + + + if (!buf_p->data) { + // Create the staging buffer regardless of whether params->host_readable + // is set or not, so that buf_copy can copy to system-memory-backed + // buffers + // TODO: Consider sharing a big staging buffer for this, rather than + // having one staging buffer per buffer + desc.BindFlags = 0; + desc.MiscFlags = 0; + desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ; + desc.Usage = D3D11_USAGE_STAGING; + D3D(ID3D11Device_CreateBuffer(p->dev, &desc, NULL, &buf_p->staging)); + } + + pl_d3d11_flush_message_queue(ctx, "After buffer create"); + + return buf; + +error: + pl_d3d11_buf_destroy(gpu, buf); + return NULL; +} + +void pl_d3d11_buf_write(pl_gpu gpu, pl_buf buf, size_t offset, const void *data, + size_t size) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct pl_buf_d3d11 *buf_p = PL_PRIV(buf); + + if (buf_p->data) { + memcpy(buf_p->data + offset, data, size); + buf_p->dirty = true; + } else { + ID3D11DeviceContext_UpdateSubresource(p->imm, + (ID3D11Resource *) buf_p->buf, 0, (&(D3D11_BOX) { + .left = offset, + .top = 0, + .front = 0, + .right = offset + size, + .bottom = 1, + .back = 1, + }), data, 0, 0); + } +} + +void pl_d3d11_buf_resolve(pl_gpu gpu, pl_buf buf) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct pl_buf_d3d11 *buf_p = PL_PRIV(buf); + + if (!buf_p->data || !buf_p->dirty) + return; + + ID3D11DeviceContext_UpdateSubresource(p->imm, (ID3D11Resource *) buf_p->buf, + 0, NULL, buf_p->data, 0, 0); +} + +bool pl_d3d11_buf_read(pl_gpu gpu, pl_buf buf, size_t offset, void *dest, + size_t 
size) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + struct pl_buf_d3d11 *buf_p = PL_PRIV(buf); + + // If there is a system-memory mirror of the buffer contents, use it + if (buf_p->data) { + memcpy(dest, buf_p->data + offset, size); + return true; + } + + ID3D11DeviceContext_CopyResource(p->imm, (ID3D11Resource *) buf_p->staging, + (ID3D11Resource *) buf_p->buf); + + D3D11_MAPPED_SUBRESOURCE lock; + D3D(ID3D11DeviceContext_Map(p->imm, (ID3D11Resource *) buf_p->staging, 0, + D3D11_MAP_READ, 0, &lock)); + + char *csrc = lock.pData; + memcpy(dest, csrc + offset, size); + + ID3D11DeviceContext_Unmap(p->imm, (ID3D11Resource *) buf_p->staging, 0); + + pl_d3d11_flush_message_queue(ctx, "After buffer read"); + + return true; + +error: + return false; +} + +void pl_d3d11_buf_copy(pl_gpu gpu, pl_buf dst, size_t dst_offset, pl_buf src, + size_t src_offset, size_t size) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + struct pl_buf_d3d11 *src_p = PL_PRIV(src); + struct pl_buf_d3d11 *dst_p = PL_PRIV(dst); + + // Handle system memory copies in case one or both of the buffers has a + // system memory mirror + if (src_p->data && dst_p->data) { + memcpy(dst_p->data + dst_offset, src_p->data + src_offset, size); + dst_p->dirty = true; + } else if (src_p->data) { + pl_d3d11_buf_write(gpu, dst, dst_offset, src_p->data + src_offset, size); + } else if (dst_p->data) { + if (pl_d3d11_buf_read(gpu, src, src_offset, dst_p->data + dst_offset, size)) { + dst_p->dirty = true; + } else { + PL_ERR(gpu, "Failed to read from GPU during buffer copy"); + } + } else { + ID3D11DeviceContext_CopySubresourceRegion(p->imm, + (ID3D11Resource *) dst_p->buf, 0, dst_offset, 0, 0, + (ID3D11Resource *) src_p->buf, 0, (&(D3D11_BOX) { + .left = src_offset, + .top = 0, + .front = 0, + .right = src_offset + size, + .bottom = 1, + .back = 1, + })); + } + + pl_d3d11_flush_message_queue(ctx, "After buffer copy"); +} diff --git a/src/d3d11/gpu_pass.c b/src/d3d11/gpu_pass.c new file mode 100644 index 0000000..0e46ccd --- /dev/null +++ b/src/d3d11/gpu_pass.c @@ -0,0 +1,1293 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "gpu.h" +#include "formats.h" +#include "glsl/spirv.h" +#include "../cache.h" + +struct stream_buf_slice { + const void *data; + unsigned int size; + unsigned int offset; +}; + +// Upload one or more slices of single-use data to a suballocated dynamic +// buffer. Only call this once per-buffer per-pass, since it will discard or +// reallocate the buffer when full. 
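+//
+// A hypothetical two-slice call (variable names invented for illustration);
+// on success, each slice's .offset reports where its data landed within
+// stream->buf:
+//
+//     struct stream_buf_slice slices[] = {
+//         { .data = vertex_data, .size = vertex_size },
+//         { .data = index_data,  .size = index_size  },
+//     };
+//     if (stream_buf_upload(gpu, &p->vbuf, slices, PL_ARRAY_SIZE(slices)))
+//         use_offsets(slices[0].offset, slices[1].offset); // hypothetical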
+static bool stream_buf_upload(pl_gpu gpu, struct d3d_stream_buf *stream, + struct stream_buf_slice *slices, int num_slices) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + unsigned int align = PL_DEF(stream->align, sizeof(float)); + + // Get total size, rounded up to the buffer's alignment + size_t size = 0; + for (int i = 0; i < num_slices; i++) + size += PL_ALIGN2(slices[i].size, align); + + if (size > gpu->limits.max_buf_size) { + PL_ERR(gpu, "Streaming buffer is too large"); + return -1; + } + + // If the data doesn't fit, realloc the buffer + if (size > stream->size) { + size_t new_size = stream->size; + // Arbitrary base size + if (!new_size) + new_size = 16 * 1024; + while (new_size < size) + new_size *= 2; + new_size = PL_MIN(new_size, gpu->limits.max_buf_size); + + ID3D11Buffer *new_buf; + D3D11_BUFFER_DESC vbuf_desc = { + .ByteWidth = new_size, + .Usage = D3D11_USAGE_DYNAMIC, + .BindFlags = stream->bind_flags, + .CPUAccessFlags = D3D11_CPU_ACCESS_WRITE, + }; + D3D(ID3D11Device_CreateBuffer(p->dev, &vbuf_desc, NULL, &new_buf)); + + SAFE_RELEASE(stream->buf); + stream->buf = new_buf; + stream->size = new_size; + stream->used = 0; + } + + bool discard = false; + size_t offset = stream->used; + if (offset + size > stream->size) { + // We reached the end of the buffer, so discard and wrap around + discard = true; + offset = 0; + } + + D3D11_MAPPED_SUBRESOURCE map = {0}; + UINT type = discard ? D3D11_MAP_WRITE_DISCARD : D3D11_MAP_WRITE_NO_OVERWRITE; + D3D(ID3D11DeviceContext_Map(p->imm, (ID3D11Resource *) stream->buf, 0, type, + 0, &map)); + + // Upload each slice + char *cdata = map.pData; + stream->used = offset; + for (int i = 0; i < num_slices; i++) { + slices[i].offset = stream->used; + memcpy(cdata + slices[i].offset, slices[i].data, slices[i].size); + stream->used += PL_ALIGN2(slices[i].size, align); + } + + ID3D11DeviceContext_Unmap(p->imm, (ID3D11Resource *) stream->buf, 0); + + return true; + +error: + return false; +} + +static const char *get_shader_target(pl_gpu gpu, enum glsl_shader_stage stage) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + switch (p->fl) { + default: + switch (stage) { + case GLSL_SHADER_VERTEX: return "vs_5_0"; + case GLSL_SHADER_FRAGMENT: return "ps_5_0"; + case GLSL_SHADER_COMPUTE: return "cs_5_0"; + } + break; + case D3D_FEATURE_LEVEL_10_1: + switch (stage) { + case GLSL_SHADER_VERTEX: return "vs_4_1"; + case GLSL_SHADER_FRAGMENT: return "ps_4_1"; + case GLSL_SHADER_COMPUTE: return "cs_4_1"; + } + break; + case D3D_FEATURE_LEVEL_10_0: + switch (stage) { + case GLSL_SHADER_VERTEX: return "vs_4_0"; + case GLSL_SHADER_FRAGMENT: return "ps_4_0"; + case GLSL_SHADER_COMPUTE: return "cs_4_0"; + } + break; + case D3D_FEATURE_LEVEL_9_3: + switch (stage) { + case GLSL_SHADER_VERTEX: return "vs_4_0_level_9_3"; + case GLSL_SHADER_FRAGMENT: return "ps_4_0_level_9_3"; + case GLSL_SHADER_COMPUTE: return NULL; + } + break; + case D3D_FEATURE_LEVEL_9_2: + case D3D_FEATURE_LEVEL_9_1: + switch (stage) { + case GLSL_SHADER_VERTEX: return "vs_4_0_level_9_1"; + case GLSL_SHADER_FRAGMENT: return "ps_4_0_level_9_1"; + case GLSL_SHADER_COMPUTE: return NULL; + } + break; + } + return NULL; +} + +static SpvExecutionModel stage_to_spv(enum glsl_shader_stage stage) +{ + static const SpvExecutionModel spv_execution_model[] = { + [GLSL_SHADER_VERTEX] = SpvExecutionModelVertex, + [GLSL_SHADER_FRAGMENT] = SpvExecutionModelFragment, + [GLSL_SHADER_COMPUTE] = SpvExecutionModelGLCompute, + }; + return spv_execution_model[stage]; +} + +#define SC(cmd) \ 
+ do { \ + spvc_result res = (cmd); \ + if (res != SPVC_SUCCESS) { \ + PL_ERR(gpu, "%s: %s (%d) (%s:%d)", \ + #cmd, sc ? spvc_context_get_last_error_string(sc) : "", \ + res, __FILE__, __LINE__); \ + goto error; \ + } \ + } while (0) + +// Some decorations, like SpvDecorationNonWritable, are actually found on the +// members of a buffer block, rather than the buffer block itself. If all +// members have a certain decoration, SPIRV-Cross considers it to apply to the +// buffer block too, which determines things like whether a SRV or UAV is used +// for an SSBO. This function checks if SPIRV-Cross considers a decoration to +// apply to a buffer block. +static spvc_result buffer_block_has_decoration(spvc_compiler sc_comp, + spvc_variable_id id, + SpvDecoration decoration, + bool *out) +{ + const SpvDecoration *decorations; + size_t num_decorations = 0; + + spvc_result res = spvc_compiler_get_buffer_block_decorations(sc_comp, id, + &decorations, &num_decorations); + if (res != SPVC_SUCCESS) + return res; + + for (size_t j = 0; j < num_decorations; j++) { + if (decorations[j] == decoration) { + *out = true; + return res; + } + } + + *out = false; + return res; +} + +static bool alloc_hlsl_reg_bindings(pl_gpu gpu, pl_pass pass, + struct d3d_pass_stage *pass_s, + spvc_context sc, + spvc_compiler sc_comp, + spvc_resources resources, + spvc_resource_type res_type, + enum glsl_shader_stage stage) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct pl_pass_d3d11 *pass_p = PL_PRIV(pass); + const spvc_reflected_resource *res_list; + size_t res_count; + + SC(spvc_resources_get_resource_list_for_type(resources, res_type, + &res_list, &res_count)); + + // In a raster pass, one of the UAV slots is used by the runtime for the RTV + int uav_offset = stage == GLSL_SHADER_COMPUTE ? 
0 : 1; + int max_uavs = p->max_uavs - uav_offset; + + for (int i = 0; i < res_count; i++) { + unsigned int binding = spvc_compiler_get_decoration(sc_comp, + res_list[i].id, SpvDecorationBinding); + unsigned int descriptor_set = spvc_compiler_get_decoration(sc_comp, + res_list[i].id, SpvDecorationDescriptorSet); + if (descriptor_set != 0) + continue; + + pass_p->max_binding = PL_MAX(pass_p->max_binding, binding); + + spvc_hlsl_resource_binding hlslbind; + spvc_hlsl_resource_binding_init(&hlslbind); + hlslbind.stage = stage_to_spv(stage); + hlslbind.binding = binding; + hlslbind.desc_set = descriptor_set; + + bool has_cbv = false, has_sampler = false, has_srv = false, has_uav = false; + switch (res_type) { + case SPVC_RESOURCE_TYPE_UNIFORM_BUFFER: + has_cbv = true; + break; + case SPVC_RESOURCE_TYPE_STORAGE_BUFFER:; + bool non_writable_bb = false; + SC(buffer_block_has_decoration(sc_comp, res_list[i].id, + SpvDecorationNonWritable, &non_writable_bb)); + if (non_writable_bb) { + has_srv = true; + } else { + has_uav = true; + } + break; + case SPVC_RESOURCE_TYPE_STORAGE_IMAGE:; + bool non_writable = spvc_compiler_has_decoration(sc_comp, + res_list[i].id, SpvDecorationNonWritable); + if (non_writable) { + has_srv = true; + } else { + has_uav = true; + } + break; + case SPVC_RESOURCE_TYPE_SEPARATE_IMAGE: + has_srv = true; + break; + case SPVC_RESOURCE_TYPE_SAMPLED_IMAGE:; + spvc_type type = spvc_compiler_get_type_handle(sc_comp, + res_list[i].type_id); + SpvDim dimension = spvc_type_get_image_dimension(type); + // Uniform texel buffers are technically sampled images, but they + // aren't sampled from, so don't allocate a sampler + if (dimension != SpvDimBuffer) + has_sampler = true; + has_srv = true; + break; + default: + break; + } + + if (has_cbv) { + hlslbind.cbv.register_binding = pass_s->cbvs.num; + PL_ARRAY_APPEND(pass, pass_s->cbvs, binding); + if (pass_s->cbvs.num > D3D11_COMMONSHADER_CONSTANT_BUFFER_API_SLOT_COUNT) { + PL_ERR(gpu, "Too many constant buffers in shader"); + goto error; + } + } + + if (has_sampler) { + hlslbind.sampler.register_binding = pass_s->samplers.num; + PL_ARRAY_APPEND(pass, pass_s->samplers, binding); + if (pass_s->samplers.num > D3D11_COMMONSHADER_SAMPLER_SLOT_COUNT) { + PL_ERR(gpu, "Too many samplers in shader"); + goto error; + } + } + + if (has_srv) { + hlslbind.srv.register_binding = pass_s->srvs.num; + PL_ARRAY_APPEND(pass, pass_s->srvs, binding); + if (pass_s->srvs.num > p->max_srvs) { + PL_ERR(gpu, "Too many SRVs in shader"); + goto error; + } + } + + if (has_uav) { + // UAV registers are shared between the vertex and fragment shaders + // in a raster pass, so check if the UAV for this resource has + // already been allocated + bool uav_bound = false; + for (int j = 0; j < pass_p->uavs.num; j++) { + if (pass_p->uavs.elem[j] == binding) { + uav_bound = true; + break; + } + } + + if (!uav_bound) { + hlslbind.uav.register_binding = pass_p->uavs.num + uav_offset; + PL_ARRAY_APPEND(pass, pass_p->uavs, binding); + if (pass_p->uavs.num > max_uavs) { + PL_ERR(gpu, "Too many UAVs in shader"); + goto error; + } + } + } + + SC(spvc_compiler_hlsl_add_resource_binding(sc_comp, &hlslbind)); + } + + return true; +error: + return false; +} + +static const char *shader_names[] = { + [GLSL_SHADER_VERTEX] = "vertex", + [GLSL_SHADER_FRAGMENT] = "fragment", + [GLSL_SHADER_COMPUTE] = "compute", +}; + +static ID3DBlob *shader_compile_glsl(pl_gpu gpu, pl_pass pass, + struct d3d_pass_stage *pass_s, + enum glsl_shader_stage stage, + const char *glsl) +{ + struct pl_gpu_d3d11 *p = 
PL_PRIV(gpu); + struct pl_pass_d3d11 *pass_p = PL_PRIV(pass); + void *tmp = pl_tmp(NULL); + spvc_context sc = NULL; + spvc_compiler sc_comp = NULL; + const char *hlsl = NULL; + ID3DBlob *out = NULL; + ID3DBlob *errors = NULL; + HRESULT hr; + + pl_clock_t start = pl_clock_now(); + pl_str spirv = pl_spirv_compile_glsl(p->spirv, tmp, gpu->glsl, stage, glsl); + if (!spirv.len) + goto error; + + pl_clock_t after_glsl = pl_clock_now(); + pl_log_cpu_time(gpu->log, start, after_glsl, "translating GLSL to SPIR-V"); + + SC(spvc_context_create(&sc)); + + spvc_parsed_ir sc_ir; + SC(spvc_context_parse_spirv(sc, (SpvId *) spirv.buf, + spirv.len / sizeof(SpvId), &sc_ir)); + + SC(spvc_context_create_compiler(sc, SPVC_BACKEND_HLSL, sc_ir, + SPVC_CAPTURE_MODE_TAKE_OWNERSHIP, + &sc_comp)); + + spvc_compiler_options sc_opts; + SC(spvc_compiler_create_compiler_options(sc_comp, &sc_opts)); + + int sc_shader_model; + if (p->fl >= D3D_FEATURE_LEVEL_11_0) { + sc_shader_model = 50; + } else if (p->fl >= D3D_FEATURE_LEVEL_10_1) { + sc_shader_model = 41; + } else { + sc_shader_model = 40; + } + + SC(spvc_compiler_options_set_uint(sc_opts, + SPVC_COMPILER_OPTION_HLSL_SHADER_MODEL, sc_shader_model)); + + // Unlike Vulkan and OpenGL, in D3D11, the clip-space is "flipped" with + // respect to framebuffer-space. In other words, if you render to a pixel at + // (0, -1), you have to sample from (0, 1) to get the value back. We unflip + // it by setting the following option, which inserts the equivalent of + // `gl_Position.y = -gl_Position.y` into the vertex shader + if (stage == GLSL_SHADER_VERTEX) { + SC(spvc_compiler_options_set_bool(sc_opts, + SPVC_COMPILER_OPTION_FLIP_VERTEX_Y, SPVC_TRUE)); + } + + // Bind readonly images and imageBuffers as SRVs. This is done because a lot + // of hardware (especially FL11_x hardware) has very poor format support for + // reading values from UAVs. It allows the common case of readonly and + // writeonly images to support more formats, though the less common case of + // readwrite images still requires format support for UAV loads (represented + // by the PL_FMT_CAP_READWRITE cap in libplacebo.) + // + // Note that setting this option comes at the cost of GLSL support. Readonly + // and readwrite images are the same type in GLSL, but SRV and UAV bound + // textures are different types in HLSL, so for example, a GLSL function + // with an image parameter may fail to compile as HLSL if it's called with a + // readonly image and a readwrite image at different call sites. 
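+    //
+    // Illustrative sketch of that failure mode: a GLSL helper such as
+    //     vec4 fetch(image2D img, ivec2 pos);
+    // called from one site with an image that is only ever read (emitted as
+    // an HLSL Texture2D SRV) and from another with an image that is also
+    // written (emitted as an RWTexture2D UAV) would need two incompatible
+    // parameter types for the same function, so the generated HLSL may not
+    // compile.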
+ SC(spvc_compiler_options_set_bool(sc_opts, + SPVC_COMPILER_OPTION_HLSL_NONWRITABLE_UAV_TEXTURE_AS_SRV, SPVC_TRUE)); + + SC(spvc_compiler_install_compiler_options(sc_comp, sc_opts)); + + spvc_set active = NULL; + SC(spvc_compiler_get_active_interface_variables(sc_comp, &active)); + spvc_resources resources = NULL; + SC(spvc_compiler_create_shader_resources_for_active_variables( + sc_comp, &resources, active)); + + // Allocate HLSL registers for each resource type + alloc_hlsl_reg_bindings(gpu, pass, pass_s, sc, sc_comp, resources, + SPVC_RESOURCE_TYPE_SAMPLED_IMAGE, stage); + alloc_hlsl_reg_bindings(gpu, pass, pass_s, sc, sc_comp, resources, + SPVC_RESOURCE_TYPE_SEPARATE_IMAGE, stage); + alloc_hlsl_reg_bindings(gpu, pass, pass_s, sc, sc_comp, resources, + SPVC_RESOURCE_TYPE_UNIFORM_BUFFER, stage); + alloc_hlsl_reg_bindings(gpu, pass, pass_s, sc, sc_comp, resources, + SPVC_RESOURCE_TYPE_STORAGE_BUFFER, stage); + alloc_hlsl_reg_bindings(gpu, pass, pass_s, sc, sc_comp, resources, + SPVC_RESOURCE_TYPE_STORAGE_IMAGE, stage); + + if (stage == GLSL_SHADER_COMPUTE) { + // Check if the gl_NumWorkGroups builtin is used. If it is, we have to + // emulate it with a constant buffer, so allocate it a CBV register. + spvc_variable_id num_workgroups_id = + spvc_compiler_hlsl_remap_num_workgroups_builtin(sc_comp); + if (num_workgroups_id) { + pass_p->num_workgroups_used = true; + + spvc_hlsl_resource_binding binding; + spvc_hlsl_resource_binding_init(&binding); + binding.stage = stage_to_spv(stage); + binding.binding = pass_p->max_binding + 1; + + // Allocate a CBV register for the buffer + binding.cbv.register_binding = pass_s->cbvs.num; + PL_ARRAY_APPEND(pass, pass_s->cbvs, HLSL_BINDING_NUM_WORKGROUPS); + if (pass_s->cbvs.num > + D3D11_COMMONSHADER_CONSTANT_BUFFER_API_SLOT_COUNT) { + PL_ERR(gpu, "Not enough constant buffer slots for gl_NumWorkGroups"); + goto error; + } + + spvc_compiler_set_decoration(sc_comp, num_workgroups_id, + SpvDecorationDescriptorSet, 0); + spvc_compiler_set_decoration(sc_comp, num_workgroups_id, + SpvDecorationBinding, binding.binding); + + SC(spvc_compiler_hlsl_add_resource_binding(sc_comp, &binding)); + } + } + + SC(spvc_compiler_compile(sc_comp, &hlsl)); + + pl_clock_t after_spvc = pl_clock_now(); + pl_log_cpu_time(gpu->log, after_glsl, after_spvc, "translating SPIR-V to HLSL"); + + hr = p->D3DCompile(hlsl, strlen(hlsl), NULL, NULL, NULL, "main", + get_shader_target(gpu, stage), + D3DCOMPILE_SKIP_VALIDATION | D3DCOMPILE_OPTIMIZATION_LEVEL3, 0, &out, + &errors); + if (FAILED(hr)) { + SAFE_RELEASE(out); + PL_ERR(gpu, "D3DCompile failed: %s\n%.*s", pl_hresult_to_str(hr), + (int) ID3D10Blob_GetBufferSize(errors), + (char *) ID3D10Blob_GetBufferPointer(errors)); + goto error; + } + + pl_log_cpu_time(gpu->log, after_spvc, pl_clock_now(), "translating HLSL to DXBC"); + +error:; + if (hlsl) { + int level = out ? 
PL_LOG_DEBUG : PL_LOG_ERR; + PL_MSG(gpu, level, "%s shader HLSL source:", shader_names[stage]); + pl_msg_source(gpu->log, level, hlsl); + } + + if (sc) + spvc_context_destroy(sc); + SAFE_RELEASE(errors); + pl_free(tmp); + return out; +} + +struct d3d11_cache_header { + uint64_t hash; + bool num_workgroups_used; + int num_main_cbvs; + int num_main_srvs; + int num_main_samplers; + int num_vertex_cbvs; + int num_vertex_srvs; + int num_vertex_samplers; + int num_uavs; + size_t vert_bc_len; + size_t frag_bc_len; + size_t comp_bc_len; +}; + +static inline uint64_t pass_cache_signature(pl_gpu gpu, uint64_t *key, + const struct pl_pass_params *params) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + + uint64_t hash = CACHE_KEY_D3D_DXBC; // seed to uniquely identify d3d11 shaders + + pl_hash_merge(&hash, pl_str0_hash(params->glsl_shader)); + if (params->type == PL_PASS_RASTER) + pl_hash_merge(&hash, pl_str0_hash(params->vertex_shader)); + + // store hash based on the shader bodys as the lookup key + if (key) + *key = hash; + + // and add the compiler version information into the verification signature + pl_hash_merge(&hash, p->spirv->signature); + + unsigned spvc_major, spvc_minor, spvc_patch; + spvc_get_version(&spvc_major, &spvc_minor, &spvc_patch); + + pl_hash_merge(&hash, spvc_major); + pl_hash_merge(&hash, spvc_minor); + pl_hash_merge(&hash, spvc_patch); + + pl_hash_merge(&hash, ((uint64_t)p->d3d_compiler_ver.major << 48) + | ((uint64_t)p->d3d_compiler_ver.minor << 32) + | ((uint64_t)p->d3d_compiler_ver.build << 16) + | (uint64_t)p->d3d_compiler_ver.revision); + pl_hash_merge(&hash, p->fl); + + return hash; +} + +static inline size_t cache_payload_size(struct d3d11_cache_header *header) +{ + size_t required = (header->num_main_cbvs + header->num_main_srvs + + header->num_main_samplers + header->num_vertex_cbvs + + header->num_vertex_srvs + header->num_vertex_samplers + + header->num_uavs) * sizeof(int) + header->vert_bc_len + + header->frag_bc_len + header->comp_bc_len; + + return required; +} + +static bool d3d11_use_cached_program(pl_gpu gpu, struct pl_pass_t *pass, + const struct pl_pass_params *params, + pl_cache_obj *obj, uint64_t *out_sig, + pl_str *vert_bc, pl_str *frag_bc, pl_str *comp_bc) +{ + struct pl_pass_d3d11 *pass_p = PL_PRIV(pass); + const pl_cache gpu_cache = pl_gpu_cache(gpu); + if (!gpu_cache) + return false; + + *out_sig = pass_cache_signature(gpu, &obj->key, params); + if (!pl_cache_get(gpu_cache, obj)) + return false; + + pl_str cache = (pl_str) { obj->data, obj->size }; + if (cache.len < sizeof(struct d3d11_cache_header)) + return false; + + struct d3d11_cache_header *header = (struct d3d11_cache_header *) cache.buf; + cache = pl_str_drop(cache, sizeof(*header)); + + if (header->hash != *out_sig) + return false; + + // determine required cache size before reading anything + size_t required = cache_payload_size(header); + + if (cache.len < required) + return false; + + pass_p->num_workgroups_used = header->num_workgroups_used; + +#define GET_ARRAY(object, name, num_elems) \ + do { \ + PL_ARRAY_MEMDUP(pass, (object)->name, cache.buf, num_elems); \ + cache = pl_str_drop(cache, num_elems * sizeof(*(object)->name.elem)); \ + } while (0) + +#define GET_STAGE_ARRAY(stage, name) \ + GET_ARRAY(&pass_p->stage, name, header->num_##stage##_##name) + + GET_STAGE_ARRAY(main, cbvs); + GET_STAGE_ARRAY(main, srvs); + GET_STAGE_ARRAY(main, samplers); + GET_STAGE_ARRAY(vertex, cbvs); + GET_STAGE_ARRAY(vertex, srvs); + GET_STAGE_ARRAY(vertex, samplers); + GET_ARRAY(pass_p, uavs, 
header->num_uavs); + +#define GET_SHADER(ptr) \ + do { \ + if (ptr) \ + *ptr = pl_str_take(cache, header->ptr##_len); \ + cache = pl_str_drop(cache, header->ptr##_len); \ + } while (0) + + GET_SHADER(vert_bc); + GET_SHADER(frag_bc); + GET_SHADER(comp_bc); + + return true; +} + +static void d3d11_update_program_cache(pl_gpu gpu, struct pl_pass_t *pass, + uint64_t key, uint64_t sig, + const pl_str *vs_str, const pl_str *ps_str, + const pl_str *cs_str) +{ + struct pl_pass_d3d11 *pass_p = PL_PRIV(pass); + const pl_cache gpu_cache = pl_gpu_cache(gpu); + if (!gpu_cache) + return; + + struct d3d11_cache_header header = { + .hash = sig, + .num_workgroups_used = pass_p->num_workgroups_used, + .num_main_cbvs = pass_p->main.cbvs.num, + .num_main_srvs = pass_p->main.srvs.num, + .num_main_samplers = pass_p->main.samplers.num, + .num_vertex_cbvs = pass_p->vertex.cbvs.num, + .num_vertex_srvs = pass_p->vertex.srvs.num, + .num_vertex_samplers = pass_p->vertex.samplers.num, + .num_uavs = pass_p->uavs.num, + .vert_bc_len = vs_str ? vs_str->len : 0, + .frag_bc_len = ps_str ? ps_str->len : 0, + .comp_bc_len = cs_str ? cs_str->len : 0, + }; + + size_t cache_size = sizeof(header) + cache_payload_size(&header); + pl_str cache = {0}; + pl_str_append(NULL, &cache, (pl_str){ (uint8_t *) &header, sizeof(header) }); + +#define WRITE_ARRAY(name) pl_str_append(NULL, &cache, \ + (pl_str){ (uint8_t *) pass_p->name.elem, \ + sizeof(*pass_p->name.elem) * pass_p->name.num }) + WRITE_ARRAY(main.cbvs); + WRITE_ARRAY(main.srvs); + WRITE_ARRAY(main.samplers); + WRITE_ARRAY(vertex.cbvs); + WRITE_ARRAY(vertex.srvs); + WRITE_ARRAY(vertex.samplers); + WRITE_ARRAY(uavs); + + if (vs_str) + pl_str_append(NULL, &cache, *vs_str); + + if (ps_str) + pl_str_append(NULL, &cache, *ps_str); + + if (cs_str) + pl_str_append(NULL, &cache, *cs_str); + + pl_assert(cache_size == cache.len); + pl_cache_str(gpu_cache, key, &cache); +} + +void pl_d3d11_pass_destroy(pl_gpu gpu, pl_pass pass) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + struct pl_pass_d3d11 *pass_p = PL_PRIV(pass); + + SAFE_RELEASE(pass_p->vs); + SAFE_RELEASE(pass_p->ps); + SAFE_RELEASE(pass_p->cs); + SAFE_RELEASE(pass_p->layout); + SAFE_RELEASE(pass_p->bstate); + SAFE_RELEASE(pass_p->num_workgroups_buf); + + pl_d3d11_flush_message_queue(ctx, "After pass destroy"); + + pl_free((void *) pass); +} + +static bool pass_create_raster(pl_gpu gpu, struct pl_pass_t *pass, + const struct pl_pass_params *params) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + struct pl_pass_d3d11 *pass_p = PL_PRIV(pass); + ID3DBlob *vs_blob = NULL; + pl_str vs_str = {0}; + ID3DBlob *ps_blob = NULL; + pl_str ps_str = {0}; + D3D11_INPUT_ELEMENT_DESC *in_descs = NULL; + pl_cache_obj obj = {0}; + uint64_t sig = 0; + bool success = false; + + if (d3d11_use_cached_program(gpu, pass, params, &obj, &sig, &vs_str, &ps_str, NULL)) + PL_DEBUG(gpu, "Using cached DXBC shaders"); + + pl_assert((vs_str.len == 0) == (ps_str.len == 0)); + if (vs_str.len == 0) { + vs_blob = shader_compile_glsl(gpu, pass, &pass_p->vertex, + GLSL_SHADER_VERTEX, params->vertex_shader); + if (!vs_blob) + goto error; + + vs_str = (pl_str) { + .buf = ID3D10Blob_GetBufferPointer(vs_blob), + .len = ID3D10Blob_GetBufferSize(vs_blob), + }; + + ps_blob = shader_compile_glsl(gpu, pass, &pass_p->main, + GLSL_SHADER_FRAGMENT, params->glsl_shader); + if (!ps_blob) + goto error; + + ps_str = (pl_str) { + .buf = ID3D10Blob_GetBufferPointer(ps_blob), + .len = ID3D10Blob_GetBufferSize(ps_blob), + }; + 
} + + D3D(ID3D11Device_CreateVertexShader(p->dev, vs_str.buf, vs_str.len, NULL, + &pass_p->vs)); + + D3D(ID3D11Device_CreatePixelShader(p->dev, ps_str.buf, ps_str.len, NULL, + &pass_p->ps)); + + in_descs = pl_calloc_ptr(pass, params->num_vertex_attribs, in_descs); + for (int i = 0; i < params->num_vertex_attribs; i++) { + struct pl_vertex_attrib *va = ¶ms->vertex_attribs[i]; + + in_descs[i] = (D3D11_INPUT_ELEMENT_DESC) { + // The semantic name doesn't mean much and is just used to verify + // the input description matches the shader. SPIRV-Cross always + // uses TEXCOORD, so we should too. + .SemanticName = "TEXCOORD", + .SemanticIndex = va->location, + .AlignedByteOffset = va->offset, + .Format = fmt_to_dxgi(va->fmt), + }; + } + D3D(ID3D11Device_CreateInputLayout(p->dev, in_descs, + params->num_vertex_attribs, vs_str.buf, vs_str.len, &pass_p->layout)); + + static const D3D11_BLEND blend_options[] = { + [PL_BLEND_ZERO] = D3D11_BLEND_ZERO, + [PL_BLEND_ONE] = D3D11_BLEND_ONE, + [PL_BLEND_SRC_ALPHA] = D3D11_BLEND_SRC_ALPHA, + [PL_BLEND_ONE_MINUS_SRC_ALPHA] = D3D11_BLEND_INV_SRC_ALPHA, + }; + + D3D11_BLEND_DESC bdesc = { + .RenderTarget[0] = { + .RenderTargetWriteMask = D3D11_COLOR_WRITE_ENABLE_ALL, + }, + }; + if (params->blend_params) { + bdesc.RenderTarget[0] = (D3D11_RENDER_TARGET_BLEND_DESC) { + .BlendEnable = TRUE, + .SrcBlend = blend_options[params->blend_params->src_rgb], + .DestBlend = blend_options[params->blend_params->dst_rgb], + .BlendOp = D3D11_BLEND_OP_ADD, + .SrcBlendAlpha = blend_options[params->blend_params->src_alpha], + .DestBlendAlpha = blend_options[params->blend_params->dst_alpha], + .BlendOpAlpha = D3D11_BLEND_OP_ADD, + .RenderTargetWriteMask = D3D11_COLOR_WRITE_ENABLE_ALL, + }; + } + D3D(ID3D11Device_CreateBlendState(p->dev, &bdesc, &pass_p->bstate)); + + d3d11_update_program_cache(gpu, pass, obj.key, sig, &vs_str, &ps_str, NULL); + + success = true; +error: + SAFE_RELEASE(vs_blob); + SAFE_RELEASE(ps_blob); + pl_cache_obj_free(&obj); + pl_free(in_descs); + return success; +} + +static bool pass_create_compute(pl_gpu gpu, struct pl_pass_t *pass, + const struct pl_pass_params *params) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + struct pl_pass_d3d11 *pass_p = PL_PRIV(pass); + ID3DBlob *cs_blob = NULL; + pl_str cs_str = {0}; + pl_cache_obj obj = {0}; + uint64_t sig = 0; + bool success = false; + + if (d3d11_use_cached_program(gpu, pass, params, &obj, &sig, NULL, NULL, &cs_str)) + PL_DEBUG(gpu, "Using cached DXBC shader"); + + if (cs_str.len == 0) { + cs_blob = shader_compile_glsl(gpu, pass, &pass_p->main, + GLSL_SHADER_COMPUTE, params->glsl_shader); + if (!cs_blob) + goto error; + + cs_str = (pl_str) { + .buf = ID3D10Blob_GetBufferPointer(cs_blob), + .len = ID3D10Blob_GetBufferSize(cs_blob), + }; + } + + D3D(ID3D11Device_CreateComputeShader(p->dev, cs_str.buf, cs_str.len, NULL, + &pass_p->cs)); + + if (pass_p->num_workgroups_used) { + D3D11_BUFFER_DESC bdesc = { + .BindFlags = D3D11_BIND_CONSTANT_BUFFER, + .ByteWidth = sizeof(pass_p->last_num_wgs), + }; + D3D(ID3D11Device_CreateBuffer(p->dev, &bdesc, NULL, + &pass_p->num_workgroups_buf)); + } + + d3d11_update_program_cache(gpu, pass, obj.key, sig, NULL, NULL, &cs_str); + + success = true; +error: + pl_cache_obj_free(&obj); + SAFE_RELEASE(cs_blob); + return success; +} + +const struct pl_pass_t *pl_d3d11_pass_create(pl_gpu gpu, + const struct pl_pass_params *params) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + + struct pl_pass_t *pass = 
pl_zalloc_obj(NULL, pass, struct pl_pass_d3d11); + pass->params = pl_pass_params_copy(pass, params); + struct pl_pass_d3d11 *pass_p = PL_PRIV(pass); + *pass_p = (struct pl_pass_d3d11) { + .max_binding = -1, + }; + + if (params->type == PL_PASS_COMPUTE) { + if (!pass_create_compute(gpu, pass, params)) + goto error; + } else { + if (!pass_create_raster(gpu, pass, params)) + goto error; + } + + // Pre-allocate resource arrays to use in pl_pass_run + pass_p->cbv_arr = pl_calloc(pass, + PL_MAX(pass_p->main.cbvs.num, pass_p->vertex.cbvs.num), + sizeof(*pass_p->cbv_arr)); + pass_p->srv_arr = pl_calloc(pass, + PL_MAX(pass_p->main.srvs.num, pass_p->vertex.srvs.num), + sizeof(*pass_p->srv_arr)); + pass_p->sampler_arr = pl_calloc(pass, + PL_MAX(pass_p->main.samplers.num, pass_p->vertex.samplers.num), + sizeof(*pass_p->sampler_arr)); + pass_p->uav_arr = pl_calloc(pass, pass_p->uavs.num, sizeof(*pass_p->uav_arr)); + + // Find the highest binding number used in `params->descriptors` if we + // haven't found it already. (If the shader was compiled fresh rather than + // loaded from cache, `pass_p->max_binding` should already be set.) + if (pass_p->max_binding == -1) { + for (int i = 0; i < params->num_descriptors; i++) { + pass_p->max_binding = PL_MAX(pass_p->max_binding, + params->descriptors[i].binding); + } + } + + // Build a mapping from binding numbers to descriptor array indexes + int *binding_map = pl_calloc_ptr(pass, pass_p->max_binding + 1, binding_map); + for (int i = 0; i <= pass_p->max_binding; i++) + binding_map[i] = HLSL_BINDING_NOT_USED; + for (int i = 0; i < params->num_descriptors; i++) + binding_map[params->descriptors[i].binding] = i; + +#define MAP_RESOURCES(array) \ + do { \ + for (int i = 0; i < array.num; i++) { \ + if (array.elem[i] > pass_p->max_binding) { \ + array.elem[i] = HLSL_BINDING_NOT_USED; \ + } else if (array.elem[i] >= 0) { \ + array.elem[i] = binding_map[array.elem[i]]; \ + } \ + } \ + } while (0) + + // During shader compilation (or after loading a compiled shader from cache) + // the entries of the following resource lists are shader binding numbers, + // however, it's more efficient for `pl_pass_run` if they refer to indexes + // of the `params->descriptors` array instead, so remap them here + MAP_RESOURCES(pass_p->main.cbvs); + MAP_RESOURCES(pass_p->main.samplers); + MAP_RESOURCES(pass_p->main.srvs); + MAP_RESOURCES(pass_p->vertex.cbvs); + MAP_RESOURCES(pass_p->vertex.samplers); + MAP_RESOURCES(pass_p->vertex.srvs); + MAP_RESOURCES(pass_p->uavs); + pl_free(binding_map); + + pl_d3d11_flush_message_queue(ctx, "After pass create"); + + return pass; + +error: + pl_d3d11_pass_destroy(gpu, pass); + return NULL; +} + +// Shared logic between VS, PS and CS for filling the resource arrays that are +// passed to ID3D11DeviceContext methods +static void fill_resources(pl_gpu gpu, pl_pass pass, + struct d3d_pass_stage *pass_s, + const struct pl_pass_run_params *params, + ID3D11Buffer **cbvs, ID3D11ShaderResourceView **srvs, + ID3D11SamplerState **samplers) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct pl_pass_d3d11 *pass_p = PL_PRIV(pass); + + for (int i = 0; i < pass_s->cbvs.num; i++) { + int binding = pass_s->cbvs.elem[i]; + if (binding == HLSL_BINDING_NUM_WORKGROUPS) { + cbvs[i] = pass_p->num_workgroups_buf; + continue; + } else if (binding < 0) { + cbvs[i] = NULL; + continue; + } + + pl_buf buf = params->desc_bindings[binding].object; + pl_d3d11_buf_resolve(gpu, buf); + struct pl_buf_d3d11 *buf_p = PL_PRIV(buf); + cbvs[i] = buf_p->buf; + } + + for (int i = 0; i < 
pass_s->srvs.num; i++) { + int binding = pass_s->srvs.elem[i]; + if (binding < 0) { + srvs[i] = NULL; + continue; + } + + pl_tex tex; + struct pl_tex_d3d11 *tex_p; + pl_buf buf; + struct pl_buf_d3d11 *buf_p; + switch (pass->params.descriptors[binding].type) { + case PL_DESC_SAMPLED_TEX: + case PL_DESC_STORAGE_IMG: + tex = params->desc_bindings[binding].object; + tex_p = PL_PRIV(tex); + srvs[i] = tex_p->srv; + break; + case PL_DESC_BUF_STORAGE: + buf = params->desc_bindings[binding].object; + buf_p = PL_PRIV(buf); + srvs[i] = buf_p->raw_srv; + break; + case PL_DESC_BUF_TEXEL_UNIFORM: + case PL_DESC_BUF_TEXEL_STORAGE: + buf = params->desc_bindings[binding].object; + buf_p = PL_PRIV(buf); + srvs[i] = buf_p->texel_srv; + break; + default: + break; + } + } + + for (int i = 0; i < pass_s->samplers.num; i++) { + int binding = pass_s->samplers.elem[i]; + if (binding < 0) { + samplers[i] = NULL; + continue; + } + + struct pl_desc_binding *db = ¶ms->desc_bindings[binding]; + samplers[i] = p->samplers[db->sample_mode][db->address_mode]; + } +} + +static void fill_uavs(pl_pass pass, const struct pl_pass_run_params *params, + ID3D11UnorderedAccessView **uavs) +{ + struct pl_pass_d3d11 *pass_p = PL_PRIV(pass); + + for (int i = 0; i < pass_p->uavs.num; i++) { + int binding = pass_p->uavs.elem[i]; + if (binding < 0) { + uavs[i] = NULL; + continue; + } + + pl_tex tex; + struct pl_tex_d3d11 *tex_p; + pl_buf buf; + struct pl_buf_d3d11 *buf_p; + switch (pass->params.descriptors[binding].type) { + case PL_DESC_BUF_STORAGE: + buf = params->desc_bindings[binding].object; + buf_p = PL_PRIV(buf); + uavs[i] = buf_p->raw_uav; + break; + case PL_DESC_STORAGE_IMG: + tex = params->desc_bindings[binding].object; + tex_p = PL_PRIV(tex); + uavs[i] = tex_p->uav; + break; + case PL_DESC_BUF_TEXEL_STORAGE: + buf = params->desc_bindings[binding].object; + buf_p = PL_PRIV(buf); + uavs[i] = buf_p->texel_uav; + break; + default: + break; + } + } +} + +static void pass_run_raster(pl_gpu gpu, const struct pl_pass_run_params *params) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + pl_pass pass = params->pass; + struct pl_pass_d3d11 *pass_p = PL_PRIV(pass); + + if (p->fl <= D3D_FEATURE_LEVEL_9_3 && params->index_buf) { + // Index buffers are unsupported because we can't tell if they are an + // index buffer or a vertex buffer on creation, and FL9_x allows only + // one binding type per-buffer + PL_ERR(gpu, "Index buffers are unsupported in FL9_x"); + return; + } + + if (p->fl <= D3D_FEATURE_LEVEL_9_1 && params->index_data && + params->index_fmt != PL_INDEX_UINT16) + { + PL_ERR(gpu, "32-bit index format is unsupported in FL9_1"); + return; + } + + // Figure out how much vertex/index data to upload, if any + size_t vertex_alloc = params->vertex_data ? pl_vertex_buf_size(params) : 0; + size_t index_alloc = params->index_data ? pl_index_buf_size(params) : 0; + + static const DXGI_FORMAT index_fmts[PL_INDEX_FORMAT_COUNT] = { + [PL_INDEX_UINT16] = DXGI_FORMAT_R16_UINT, + [PL_INDEX_UINT32] = DXGI_FORMAT_R32_UINT, + }; + + // Upload vertex data. On >=FL10_0 we use the same buffer for index data, so + // upload that too. + bool share_vertex_index_buf = p->fl > D3D_FEATURE_LEVEL_9_3; + if (vertex_alloc || (share_vertex_index_buf && index_alloc)) { + struct stream_buf_slice slices[] = { + { .data = params->vertex_data, .size = vertex_alloc }, + { .data = params->index_data, .size = index_alloc }, + }; + + if (!stream_buf_upload(gpu, &p->vbuf, slices, + share_vertex_index_buf ? 
2 : 1)) { + PL_ERR(gpu, "Failed to upload vertex data"); + return; + } + + if (vertex_alloc) { + ID3D11DeviceContext_IASetVertexBuffers(p->imm, 0, 1, &p->vbuf.buf, + &(UINT) { pass->params.vertex_stride }, &slices[0].offset); + } + if (share_vertex_index_buf && index_alloc) { + ID3D11DeviceContext_IASetIndexBuffer(p->imm, p->vbuf.buf, + index_fmts[params->index_fmt], slices[1].offset); + } + } + + // Upload index data for <=FL9_3, which must be in its own buffer + if (!share_vertex_index_buf && index_alloc) { + struct stream_buf_slice slices[] = { + { .data = params->index_data, .size = index_alloc }, + }; + + if (!stream_buf_upload(gpu, &p->ibuf, slices, PL_ARRAY_SIZE(slices))) { + PL_ERR(gpu, "Failed to upload index data"); + return; + } + + ID3D11DeviceContext_IASetIndexBuffer(p->imm, p->ibuf.buf, + index_fmts[params->index_fmt], slices[0].offset); + } + + if (params->vertex_buf) { + struct pl_buf_d3d11 *buf_p = PL_PRIV(params->vertex_buf); + ID3D11DeviceContext_IASetVertexBuffers(p->imm, 0, 1, &buf_p->buf, + &(UINT) { pass->params.vertex_stride }, + &(UINT) { params->buf_offset }); + } + + if (params->index_buf) { + struct pl_buf_d3d11 *buf_p = PL_PRIV(params->index_buf); + ID3D11DeviceContext_IASetIndexBuffer(p->imm, buf_p->buf, + index_fmts[params->index_fmt], params->index_offset); + } + + ID3D11DeviceContext_IASetInputLayout(p->imm, pass_p->layout); + + static const D3D_PRIMITIVE_TOPOLOGY prim_topology[] = { + [PL_PRIM_TRIANGLE_LIST] = D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST, + [PL_PRIM_TRIANGLE_STRIP] = D3D11_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP, + }; + ID3D11DeviceContext_IASetPrimitiveTopology(p->imm, + prim_topology[pass->params.vertex_type]); + + ID3D11DeviceContext_VSSetShader(p->imm, pass_p->vs, NULL, 0); + + ID3D11Buffer **cbvs = pass_p->cbv_arr; + ID3D11ShaderResourceView **srvs = pass_p->srv_arr; + ID3D11SamplerState **samplers = pass_p->sampler_arr; + ID3D11UnorderedAccessView **uavs = pass_p->uav_arr; + + // Set vertex shader resources. The device context is called conditionally + // because the debug layer complains if these are called with 0 resources. 
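+    // fill_resources() resolves one shader stage's descriptor bindings into
+    // the cbv/srv/sampler arrays pre-allocated in pl_d3d11_pass_create.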
+ fill_resources(gpu, pass, &pass_p->vertex, params, cbvs, srvs, samplers); + if (pass_p->vertex.cbvs.num) + ID3D11DeviceContext_VSSetConstantBuffers(p->imm, 0, pass_p->vertex.cbvs.num, cbvs); + if (pass_p->vertex.srvs.num) + ID3D11DeviceContext_VSSetShaderResources(p->imm, 0, pass_p->vertex.srvs.num, srvs); + if (pass_p->vertex.samplers.num) + ID3D11DeviceContext_VSSetSamplers(p->imm, 0, pass_p->vertex.samplers.num, samplers); + + ID3D11DeviceContext_RSSetState(p->imm, p->rstate); + ID3D11DeviceContext_RSSetViewports(p->imm, 1, (&(D3D11_VIEWPORT) { + .TopLeftX = params->viewport.x0, + .TopLeftY = params->viewport.y0, + .Width = pl_rect_w(params->viewport), + .Height = pl_rect_h(params->viewport), + .MinDepth = 0, + .MaxDepth = 1, + })); + ID3D11DeviceContext_RSSetScissorRects(p->imm, 1, (&(D3D11_RECT) { + .left = params->scissors.x0, + .top = params->scissors.y0, + .right = params->scissors.x1, + .bottom = params->scissors.y1, + })); + + ID3D11DeviceContext_PSSetShader(p->imm, pass_p->ps, NULL, 0); + + // Set pixel shader resources + fill_resources(gpu, pass, &pass_p->main, params, cbvs, srvs, samplers); + if (pass_p->main.cbvs.num) + ID3D11DeviceContext_PSSetConstantBuffers(p->imm, 0, pass_p->main.cbvs.num, cbvs); + if (pass_p->main.srvs.num) + ID3D11DeviceContext_PSSetShaderResources(p->imm, 0, pass_p->main.srvs.num, srvs); + if (pass_p->main.samplers.num) + ID3D11DeviceContext_PSSetSamplers(p->imm, 0, pass_p->main.samplers.num, samplers); + + ID3D11DeviceContext_OMSetBlendState(p->imm, pass_p->bstate, NULL, + D3D11_DEFAULT_SAMPLE_MASK); + ID3D11DeviceContext_OMSetDepthStencilState(p->imm, p->dsstate, 0); + + fill_uavs(pass, params, uavs); + + struct pl_tex_d3d11 *target_p = PL_PRIV(params->target); + ID3D11DeviceContext_OMSetRenderTargetsAndUnorderedAccessViews( + p->imm, 1, &target_p->rtv, NULL, 1, pass_p->uavs.num, uavs, NULL); + + if (params->index_data || params->index_buf) { + ID3D11DeviceContext_DrawIndexed(p->imm, params->vertex_count, 0, 0); + } else { + ID3D11DeviceContext_Draw(p->imm, params->vertex_count, 0); + } + + // Unbind everything. It's easier to do this than to actually track state, + // and if we leave the RTV bound, it could trip up D3D's conflict checker. + // Also, apparently unbinding SRVs can prevent a 10level9 bug? 
+ // https://docs.microsoft.com/en-us/windows/win32/direct3d11/overviews-direct3d-11-devices-downlevel-prevent-null-srvs + for (int i = 0; i < PL_MAX(pass_p->main.cbvs.num, pass_p->vertex.cbvs.num); i++) + cbvs[i] = NULL; + for (int i = 0; i < PL_MAX(pass_p->main.srvs.num, pass_p->vertex.srvs.num); i++) + srvs[i] = NULL; + for (int i = 0; i < PL_MAX(pass_p->main.samplers.num, pass_p->vertex.samplers.num); i++) + samplers[i] = NULL; + for (int i = 0; i < pass_p->uavs.num; i++) + uavs[i] = NULL; + if (pass_p->vertex.cbvs.num) + ID3D11DeviceContext_VSSetConstantBuffers(p->imm, 0, pass_p->vertex.cbvs.num, cbvs); + if (pass_p->vertex.srvs.num) + ID3D11DeviceContext_VSSetShaderResources(p->imm, 0, pass_p->vertex.srvs.num, srvs); + if (pass_p->vertex.samplers.num) + ID3D11DeviceContext_VSSetSamplers(p->imm, 0, pass_p->vertex.samplers.num, samplers); + if (pass_p->main.cbvs.num) + ID3D11DeviceContext_PSSetConstantBuffers(p->imm, 0, pass_p->main.cbvs.num, cbvs); + if (pass_p->main.srvs.num) + ID3D11DeviceContext_PSSetShaderResources(p->imm, 0, pass_p->main.srvs.num, srvs); + if (pass_p->main.samplers.num) + ID3D11DeviceContext_PSSetSamplers(p->imm, 0, pass_p->main.samplers.num, samplers); + ID3D11DeviceContext_OMSetRenderTargetsAndUnorderedAccessViews( + p->imm, 0, NULL, NULL, 1, pass_p->uavs.num, uavs, NULL); +} + +static void pass_run_compute(pl_gpu gpu, const struct pl_pass_run_params *params) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + pl_pass pass = params->pass; + struct pl_pass_d3d11 *pass_p = PL_PRIV(pass); + + // Update gl_NumWorkGroups emulation buffer if necessary + if (pass_p->num_workgroups_used) { + bool needs_update = false; + for (int i = 0; i < 3; i++) { + if (pass_p->last_num_wgs.num_wgs[i] != params->compute_groups[i]) + needs_update = true; + pass_p->last_num_wgs.num_wgs[i] = params->compute_groups[i]; + } + + if (needs_update) { + ID3D11DeviceContext_UpdateSubresource(p->imm, + (ID3D11Resource *) pass_p->num_workgroups_buf, 0, NULL, + &pass_p->last_num_wgs, 0, 0); + } + } + + ID3D11DeviceContext_CSSetShader(p->imm, pass_p->cs, NULL, 0); + + ID3D11Buffer **cbvs = pass_p->cbv_arr; + ID3D11ShaderResourceView **srvs = pass_p->srv_arr; + ID3D11UnorderedAccessView **uavs = pass_p->uav_arr; + ID3D11SamplerState **samplers = pass_p->sampler_arr; + + fill_resources(gpu, pass, &pass_p->main, params, cbvs, srvs, samplers); + fill_uavs(pass, params, uavs); + + if (pass_p->main.cbvs.num) + ID3D11DeviceContext_CSSetConstantBuffers(p->imm, 0, pass_p->main.cbvs.num, cbvs); + if (pass_p->main.srvs.num) + ID3D11DeviceContext_CSSetShaderResources(p->imm, 0, pass_p->main.srvs.num, srvs); + if (pass_p->main.samplers.num) + ID3D11DeviceContext_CSSetSamplers(p->imm, 0, pass_p->main.samplers.num, samplers); + if (pass_p->uavs.num) + ID3D11DeviceContext_CSSetUnorderedAccessViews(p->imm, 0, pass_p->uavs.num, uavs, NULL); + + ID3D11DeviceContext_Dispatch(p->imm, params->compute_groups[0], + params->compute_groups[1], + params->compute_groups[2]); + + // Unbind everything + for (int i = 0; i < pass_p->main.cbvs.num; i++) + cbvs[i] = NULL; + for (int i = 0; i < pass_p->main.srvs.num; i++) + srvs[i] = NULL; + for (int i = 0; i < pass_p->main.samplers.num; i++) + samplers[i] = NULL; + for (int i = 0; i < pass_p->uavs.num; i++) + uavs[i] = NULL; + if (pass_p->main.cbvs.num) + ID3D11DeviceContext_CSSetConstantBuffers(p->imm, 0, pass_p->main.cbvs.num, cbvs); + if (pass_p->main.srvs.num) + ID3D11DeviceContext_CSSetShaderResources(p->imm, 0, pass_p->main.srvs.num, srvs); + if (pass_p->main.samplers.num) + 
ID3D11DeviceContext_CSSetSamplers(p->imm, 0, pass_p->main.samplers.num, samplers); + if (pass_p->uavs.num) + ID3D11DeviceContext_CSSetUnorderedAccessViews(p->imm, 0, pass_p->uavs.num, uavs, NULL); +} + +void pl_d3d11_pass_run(pl_gpu gpu, const struct pl_pass_run_params *params) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + pl_pass pass = params->pass; + + pl_d3d11_timer_start(gpu, params->timer); + + if (pass->params.type == PL_PASS_COMPUTE) { + pass_run_compute(gpu, params); + } else { + pass_run_raster(gpu, params); + } + + pl_d3d11_timer_end(gpu, params->timer); + pl_d3d11_flush_message_queue(ctx, "After pass run"); +} diff --git a/src/d3d11/gpu_tex.c b/src/d3d11/gpu_tex.c new file mode 100644 index 0000000..d63fc17 --- /dev/null +++ b/src/d3d11/gpu_tex.c @@ -0,0 +1,745 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "gpu.h" +#include "formats.h" + +static inline UINT tex_subresource(pl_tex tex) +{ + struct pl_tex_d3d11 *tex_p = PL_PRIV(tex); + return tex_p->array_slice >= 0 ? tex_p->array_slice : 0; +} + +static bool tex_init(pl_gpu gpu, pl_tex tex) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + struct pl_tex_d3d11 *tex_p = PL_PRIV(tex); + + // View formats may be omitted when they match the texture format, but for + // simplicity's sake we always set it. It will match the texture format for + // textures created with tex_create, but it can be different for video + // textures wrapped with pl_d3d11_wrap. 
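+    //
+    // Each view created below (SRV, RTV and UAV) targets only the selected
+    // array slice when `array_slice` is set, so a pl_tex can refer to a
+    // single member of a texture array.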
+ DXGI_FORMAT fmt = fmt_to_dxgi(tex->params.format); + + if (tex->params.sampleable || tex->params.storable) { + D3D11_SHADER_RESOURCE_VIEW_DESC srvdesc = { + .Format = fmt, + }; + switch (pl_tex_params_dimension(tex->params)) { + case 1: + if (tex_p->array_slice >= 0) { + srvdesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE1DARRAY; + srvdesc.Texture1DArray.MipLevels = 1; + srvdesc.Texture1DArray.FirstArraySlice = tex_p->array_slice; + srvdesc.Texture1DArray.ArraySize = 1; + } else { + srvdesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE1D; + srvdesc.Texture1D.MipLevels = 1; + } + break; + case 2: + if (tex_p->array_slice >= 0) { + srvdesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2DARRAY; + srvdesc.Texture2DArray.MipLevels = 1; + srvdesc.Texture2DArray.FirstArraySlice = tex_p->array_slice; + srvdesc.Texture2DArray.ArraySize = 1; + } else { + srvdesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2D; + srvdesc.Texture2D.MipLevels = 1; + } + break; + case 3: + // D3D11 does not have Texture3D arrays + srvdesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE3D; + srvdesc.Texture3D.MipLevels = 1; + break; + } + D3D(ID3D11Device_CreateShaderResourceView(p->dev, tex_p->res, &srvdesc, + &tex_p->srv)); + } + + if (tex->params.renderable) { + D3D11_RENDER_TARGET_VIEW_DESC rtvdesc = { + .Format = fmt, + }; + switch (pl_tex_params_dimension(tex->params)) { + case 1: + if (tex_p->array_slice >= 0) { + rtvdesc.ViewDimension = D3D11_RTV_DIMENSION_TEXTURE1DARRAY; + rtvdesc.Texture1DArray.FirstArraySlice = tex_p->array_slice; + rtvdesc.Texture1DArray.ArraySize = 1; + } else { + rtvdesc.ViewDimension = D3D11_RTV_DIMENSION_TEXTURE1D; + } + break; + case 2: + if (tex_p->array_slice >= 0) { + rtvdesc.ViewDimension = D3D11_RTV_DIMENSION_TEXTURE2DARRAY; + rtvdesc.Texture2DArray.FirstArraySlice = tex_p->array_slice; + rtvdesc.Texture2DArray.ArraySize = 1; + } else { + rtvdesc.ViewDimension = D3D11_RTV_DIMENSION_TEXTURE2D; + } + break; + case 3: + // D3D11 does not have Texture3D arrays + rtvdesc.ViewDimension = D3D11_RTV_DIMENSION_TEXTURE3D; + rtvdesc.Texture3D.WSize = -1; + break; + } + D3D(ID3D11Device_CreateRenderTargetView(p->dev, tex_p->res, &rtvdesc, + &tex_p->rtv)); + } + + if (p->fl >= D3D_FEATURE_LEVEL_11_0 && tex->params.storable) { + D3D11_UNORDERED_ACCESS_VIEW_DESC uavdesc = { + .Format = fmt, + }; + switch (pl_tex_params_dimension(tex->params)) { + case 1: + if (tex_p->array_slice >= 0) { + uavdesc.ViewDimension = D3D11_UAV_DIMENSION_TEXTURE1DARRAY; + uavdesc.Texture1DArray.FirstArraySlice = tex_p->array_slice; + uavdesc.Texture1DArray.ArraySize = 1; + } else { + uavdesc.ViewDimension = D3D11_UAV_DIMENSION_TEXTURE1D; + } + break; + case 2: + if (tex_p->array_slice >= 0) { + uavdesc.ViewDimension = D3D11_UAV_DIMENSION_TEXTURE2DARRAY; + uavdesc.Texture2DArray.FirstArraySlice = tex_p->array_slice; + uavdesc.Texture2DArray.ArraySize = 1; + } else { + uavdesc.ViewDimension = D3D11_UAV_DIMENSION_TEXTURE2D; + } + break; + case 3: + // D3D11 does not have Texture3D arrays + uavdesc.ViewDimension = D3D11_UAV_DIMENSION_TEXTURE3D; + uavdesc.Texture3D.WSize = -1; + break; + } + D3D(ID3D11Device_CreateUnorderedAccessView(p->dev, tex_p->res, &uavdesc, + &tex_p->uav)); + } + + return true; +error: + return false; +} + +void pl_d3d11_tex_destroy(pl_gpu gpu, pl_tex tex) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + struct pl_tex_d3d11 *tex_p = PL_PRIV(tex); + + SAFE_RELEASE(tex_p->srv); + SAFE_RELEASE(tex_p->rtv); + SAFE_RELEASE(tex_p->uav); + SAFE_RELEASE(tex_p->res); + 
SAFE_RELEASE(tex_p->staging); + + pl_d3d11_flush_message_queue(ctx, "After texture destroy"); + + pl_free((void *) tex); +} + +pl_tex pl_d3d11_tex_create(pl_gpu gpu, const struct pl_tex_params *params) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + + struct pl_tex_t *tex = pl_zalloc_obj(NULL, tex, struct pl_tex_d3d11); + tex->params = *params; + tex->params.initial_data = NULL; + tex->sampler_type = PL_SAMPLER_NORMAL; + + struct pl_tex_d3d11 *tex_p = PL_PRIV(tex); + + DXGI_FORMAT dxfmt = fmt_to_dxgi(params->format); + + D3D11_USAGE usage = D3D11_USAGE_DEFAULT; + D3D11_BIND_FLAG bind_flags = 0; + + if (params->format->emulated) { + tex_p->texel_fmt = pl_find_fmt(gpu, params->format->type, 1, 0, + params->format->host_bits[0], + PL_FMT_CAP_TEXEL_UNIFORM); + + if (!tex_p->texel_fmt) { + PL_ERR(gpu, "Failed picking texel format for emulated texture!"); + goto error; + } + + tex->params.storable = true; + } + + if (p->fl >= D3D_FEATURE_LEVEL_11_0) { + // On >=FL11_0, blit emulation needs image storage + tex->params.storable |= params->blit_src || params->blit_dst; + + // Blit emulation can use a sampler for linear filtering during stretch + if ((tex->params.format->caps & PL_FMT_CAP_LINEAR) && params->blit_src) + tex->params.sampleable = true; + } else { + // On <FL11_0, blit emulation uses a render pass + tex->params.sampleable |= params->blit_src; + tex->params.renderable |= params->blit_dst; + } + + if (tex->params.sampleable) + bind_flags |= D3D11_BIND_SHADER_RESOURCE; + if (tex->params.renderable) + bind_flags |= D3D11_BIND_RENDER_TARGET; + if (p->fl >= D3D_FEATURE_LEVEL_11_0 && tex->params.storable) + bind_flags |= D3D11_BIND_SHADER_RESOURCE | D3D11_BIND_UNORDERED_ACCESS; + + // Apparently IMMUTABLE textures are efficient, so try to infer whether we + // can use one + if (params->initial_data && !params->format->emulated && + !tex->params.renderable && !tex->params.storable && !params->host_writable) + { + usage = D3D11_USAGE_IMMUTABLE; + } + + // In FL9_x, resources with only D3D11_BIND_SHADER_RESOURCE can't be copied + // from GPU-accessible memory to CPU-accessible memory. The only other bind + // flag we set on this FL is D3D11_BIND_RENDER_TARGET, so set it. 
+ if (p->fl <= D3D_FEATURE_LEVEL_9_3 && tex->params.host_readable) + bind_flags |= D3D11_BIND_RENDER_TARGET; + + // In FL9_x, when using DEFAULT or IMMUTABLE, BindFlags cannot be zero + if (p->fl <= D3D_FEATURE_LEVEL_9_3 && !bind_flags) + bind_flags |= D3D11_BIND_SHADER_RESOURCE; + + D3D11_SUBRESOURCE_DATA data; + D3D11_SUBRESOURCE_DATA *pdata = NULL; + if (params->initial_data && !params->format->emulated) { + data = (D3D11_SUBRESOURCE_DATA) { + .pSysMem = params->initial_data, + .SysMemPitch = params->w * params->format->texel_size, + }; + if (params->d) + data.SysMemSlicePitch = data.SysMemPitch * params->h; + pdata = &data; + } + + switch (pl_tex_params_dimension(*params)) { + case 1:; + D3D11_TEXTURE1D_DESC desc1d = { + .Width = params->w, + .MipLevels = 1, + .ArraySize = 1, + .Format = dxfmt, + .Usage = usage, + .BindFlags = bind_flags, + }; + D3D(ID3D11Device_CreateTexture1D(p->dev, &desc1d, pdata, &tex_p->tex1d)); + tex_p->res = (ID3D11Resource *)tex_p->tex1d; + + // Create a staging texture with CPU access for pl_tex_download() + if (params->host_readable) { + desc1d.BindFlags = 0; + desc1d.CPUAccessFlags = D3D11_CPU_ACCESS_READ; + desc1d.Usage = D3D11_USAGE_STAGING; + + D3D(ID3D11Device_CreateTexture1D(p->dev, &desc1d, NULL, + &tex_p->staging1d)); + tex_p->staging = (ID3D11Resource *) tex_p->staging1d; + } + break; + case 2:; + D3D11_TEXTURE2D_DESC desc2d = { + .Width = params->w, + .Height = params->h, + .MipLevels = 1, + .ArraySize = 1, + .SampleDesc.Count = 1, + .Format = dxfmt, + .Usage = usage, + .BindFlags = bind_flags, + }; + D3D(ID3D11Device_CreateTexture2D(p->dev, &desc2d, pdata, &tex_p->tex2d)); + tex_p->res = (ID3D11Resource *)tex_p->tex2d; + + // Create a staging texture with CPU access for pl_tex_download() + if (params->host_readable) { + desc2d.BindFlags = 0; + desc2d.CPUAccessFlags = D3D11_CPU_ACCESS_READ; + desc2d.Usage = D3D11_USAGE_STAGING; + + D3D(ID3D11Device_CreateTexture2D(p->dev, &desc2d, NULL, + &tex_p->staging2d)); + tex_p->staging = (ID3D11Resource *) tex_p->staging2d; + } + break; + case 3:; + D3D11_TEXTURE3D_DESC desc3d = { + .Width = params->w, + .Height = params->h, + .Depth = params->d, + .MipLevels = 1, + .Format = dxfmt, + .Usage = usage, + .BindFlags = bind_flags, + }; + D3D(ID3D11Device_CreateTexture3D(p->dev, &desc3d, pdata, &tex_p->tex3d)); + tex_p->res = (ID3D11Resource *)tex_p->tex3d; + + // Create a staging texture with CPU access for pl_tex_download() + if (params->host_readable) { + desc3d.BindFlags = 0; + desc3d.CPUAccessFlags = D3D11_CPU_ACCESS_READ; + desc3d.Usage = D3D11_USAGE_STAGING; + + D3D(ID3D11Device_CreateTexture3D(p->dev, &desc3d, NULL, + &tex_p->staging3d)); + tex_p->staging = (ID3D11Resource *) tex_p->staging3d; + } + break; + default: + pl_unreachable(); + } + + tex_p->array_slice = -1; + + if (!tex_init(gpu, tex)) + goto error; + + if (params->initial_data && params->format->emulated) { + struct pl_tex_transfer_params ul_params = { + .tex = tex, + .ptr = (void *) params->initial_data, + .rc = { 0, 0, 0, params->w, params->h, params->d }, + }; + + // Since we re-use GPU helpers which require writable images, just fake it + bool writable = tex->params.host_writable; + tex->params.host_writable = true; + if (!pl_tex_upload(gpu, &ul_params)) + goto error; + tex->params.host_writable = writable; + } + + pl_d3d11_flush_message_queue(ctx, "After texture create"); + + return tex; + +error: + pl_d3d11_tex_destroy(gpu, tex); + return NULL; +} + +pl_tex pl_d3d11_wrap(pl_gpu gpu, const struct pl_d3d11_wrap_params *params) +{ + struct 
pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + + struct pl_tex_t *tex = pl_zalloc_obj(NULL, tex, struct pl_tex_d3d11); + tex->sampler_type = PL_SAMPLER_NORMAL; + + struct pl_tex_d3d11 *tex_p = PL_PRIV(tex); + + DXGI_FORMAT fmt = DXGI_FORMAT_UNKNOWN; + D3D11_USAGE usage = D3D11_USAGE_DEFAULT; + D3D11_BIND_FLAG bind_flags = 0; + UINT mip_levels = 1; + UINT array_size = 1; + UINT sample_count = 1; + + D3D11_RESOURCE_DIMENSION type; + ID3D11Resource_GetType(params->tex, &type); + + switch (type) { + case D3D11_RESOURCE_DIMENSION_TEXTURE1D: + D3D(ID3D11Resource_QueryInterface(params->tex, &IID_ID3D11Texture1D, + (void **) &tex_p->tex1d)); + tex_p->res = (ID3D11Resource *) tex_p->tex1d; + + D3D11_TEXTURE1D_DESC desc1d; + ID3D11Texture1D_GetDesc(tex_p->tex1d, &desc1d); + + tex->params.w = desc1d.Width; + mip_levels = desc1d.MipLevels; + array_size = desc1d.ArraySize; + fmt = desc1d.Format; + usage = desc1d.Usage; + bind_flags = desc1d.BindFlags; + break; + + case D3D11_RESOURCE_DIMENSION_TEXTURE2D: + D3D(ID3D11Resource_QueryInterface(params->tex, &IID_ID3D11Texture2D, + (void **) &tex_p->tex2d)); + tex_p->res = (ID3D11Resource *) tex_p->tex2d; + + D3D11_TEXTURE2D_DESC desc2d; + ID3D11Texture2D_GetDesc(tex_p->tex2d, &desc2d); + + tex->params.w = desc2d.Width; + tex->params.h = desc2d.Height; + mip_levels = desc2d.MipLevels; + array_size = desc2d.ArraySize; + fmt = desc2d.Format; + sample_count = desc2d.SampleDesc.Count; + usage = desc2d.Usage; + bind_flags = desc2d.BindFlags; + + // Allow the format and size of 2D textures to be overridden to support + // shader views of video resources + if (params->fmt) { + fmt = params->fmt; + tex->params.w = params->w; + tex->params.h = params->h; + } + + break; + + case D3D11_RESOURCE_DIMENSION_TEXTURE3D: + D3D(ID3D11Resource_QueryInterface(params->tex, &IID_ID3D11Texture3D, + (void **) &tex_p->tex3d)); + tex_p->res = (ID3D11Resource *) tex_p->tex3d; + + D3D11_TEXTURE3D_DESC desc3d; + ID3D11Texture3D_GetDesc(tex_p->tex3d, &desc3d); + + tex->params.w = desc3d.Width; + tex->params.h = desc3d.Height; + tex->params.d = desc3d.Depth; + mip_levels = desc3d.MipLevels; + fmt = desc3d.Format; + usage = desc3d.Usage; + bind_flags = desc3d.BindFlags; + break; + + case D3D11_RESOURCE_DIMENSION_UNKNOWN: + case D3D11_RESOURCE_DIMENSION_BUFFER: + PL_ERR(gpu, "Resource is not suitable to wrap"); + goto error; + } + + if (mip_levels != 1) { + PL_ERR(gpu, "Mipmapped textures not supported for wrapping"); + goto error; + } + if (sample_count != 1) { + PL_ERR(gpu, "Multisampled textures not supported for wrapping"); + goto error; + } + if (usage != D3D11_USAGE_DEFAULT) { + PL_ERR(gpu, "Resource is not D3D11_USAGE_DEFAULT"); + goto error; + } + + if (array_size > 1) { + if (params->array_slice < 0 || params->array_slice >= array_size) { + PL_ERR(gpu, "array_slice out of range"); + goto error; + } + tex_p->array_slice = params->array_slice; + } else { + tex_p->array_slice = -1; + } + + if (bind_flags & D3D11_BIND_SHADER_RESOURCE) { + tex->params.sampleable = true; + + // Blit emulation uses a render pass on <FL11_0 + if (p->fl < D3D_FEATURE_LEVEL_11_0) + tex->params.blit_src = true; + } + if (bind_flags & D3D11_BIND_RENDER_TARGET) { + tex->params.renderable = true; + + // Blit emulation uses a render pass on <FL11_0 + if (p->fl < D3D_FEATURE_LEVEL_11_0) + tex->params.blit_dst = true; + } + static const D3D11_BIND_FLAG storable_flags = + D3D11_BIND_UNORDERED_ACCESS | D3D11_BIND_SHADER_RESOURCE; + if ((bind_flags & storable_flags) == storable_flags) { + 
tex->params.storable = true; + + // Blit emulation uses image storage on >=FL11_0. A feature level check + // isn't required because <FL11_0 doesn't have storable images. + tex->params.blit_src = tex->params.blit_dst = true; + } + + for (int i = 0; i < gpu->num_formats; i++) { + DXGI_FORMAT target_fmt = fmt_to_dxgi(gpu->formats[i]); + if (fmt == target_fmt) { + tex->params.format = gpu->formats[i]; + break; + } + } + if (!tex->params.format) { + PL_ERR(gpu, "Could not find a suitable pl_fmt for wrapped resource"); + goto error; + } + + if (!tex_init(gpu, tex)) + goto error; + + pl_d3d11_flush_message_queue(ctx, "After texture wrap"); + + return tex; + +error: + pl_d3d11_tex_destroy(gpu, tex); + return NULL; +} + +void pl_d3d11_tex_invalidate(pl_gpu gpu, pl_tex tex) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + struct pl_tex_d3d11 *tex_p = PL_PRIV(tex); + + // Resource discarding requires D3D11.1 + if (!p->imm1) + return; + + // Prefer discarding a view to discarding the whole resource. The reason + // for this is that a pl_tex can refer to a single member of a texture + // array. Discarding the SRV, RTV or UAV should only discard that member. + if (tex_p->rtv) { + ID3D11DeviceContext1_DiscardView(p->imm1, (ID3D11View *) tex_p->rtv); + } else if (tex_p->uav) { + ID3D11DeviceContext1_DiscardView(p->imm1, (ID3D11View *) tex_p->uav); + } else if (tex_p->srv) { + ID3D11DeviceContext1_DiscardView(p->imm1, (ID3D11View *) tex_p->srv); + } else if (tex_p->array_slice < 0) { + // If there are no views, only discard if the ID3D11Resource is not a + // texture array + ID3D11DeviceContext1_DiscardResource(p->imm1, tex_p->res); + } + + pl_d3d11_flush_message_queue(ctx, "After texture invalidate"); +} + +void pl_d3d11_tex_clear_ex(pl_gpu gpu, pl_tex tex, + const union pl_clear_color color) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + struct pl_tex_d3d11 *tex_p = PL_PRIV(tex); + + if (tex->params.format->type == PL_FMT_UINT) { + if (tex_p->uav) { + ID3D11DeviceContext_ClearUnorderedAccessViewUint(p->imm, tex_p->uav, + color.u); + } else { + float c[4] = { color.u[0], color.u[1], color.u[2], color.u[3] }; + ID3D11DeviceContext_ClearRenderTargetView(p->imm, tex_p->rtv, c); + } + + } else if (tex->params.format->type == PL_FMT_SINT) { + if (tex_p->uav) { + ID3D11DeviceContext_ClearUnorderedAccessViewUint(p->imm, tex_p->uav, + (const uint32_t *)color.i); + } else { + float c[4] = { color.i[0], color.i[1], color.i[2], color.i[3] }; + ID3D11DeviceContext_ClearRenderTargetView(p->imm, tex_p->rtv, c); + } + + } else if (tex_p->rtv) { + ID3D11DeviceContext_ClearRenderTargetView(p->imm, tex_p->rtv, color.f); + } else { + ID3D11DeviceContext_ClearUnorderedAccessViewFloat(p->imm, tex_p->uav, color.f); + } + + pl_d3d11_flush_message_queue(ctx, "After texture clear"); +} + +#define pl_rect3d_to_box(rc) \ + ((D3D11_BOX) { \ + .left = rc.x0, .top = rc.y0, .front = rc.z0, \ + .right = rc.x1, .bottom = rc.y1, .back = rc.z1, \ + }) + +void pl_d3d11_tex_blit(pl_gpu gpu, const struct pl_tex_blit_params *params) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + struct pl_tex_d3d11 *src_p = PL_PRIV(params->src); + DXGI_FORMAT src_fmt = fmt_to_dxgi(params->src->params.format); + struct pl_tex_d3d11 *dst_p = PL_PRIV(params->dst); + DXGI_FORMAT dst_fmt = fmt_to_dxgi(params->dst->params.format); + + // If the blit operation doesn't require flipping, scaling or format + // conversion, we can use CopySubresourceRegion + pl_rect3d src_rc 
= params->src_rc, dst_rc = params->dst_rc; + if (pl_rect3d_eq(src_rc, dst_rc) && src_fmt == dst_fmt) { + pl_rect3d rc = params->src_rc; + pl_rect3d_normalize(&rc); + + ID3D11DeviceContext_CopySubresourceRegion(p->imm, dst_p->res, + tex_subresource(params->dst), rc.x0, rc.y0, rc.z0, src_p->res, + tex_subresource(params->src), &pl_rect3d_to_box(rc)); + } else if (p->fl >= D3D_FEATURE_LEVEL_11_0) { + if (!pl_tex_blit_compute(gpu, params)) + PL_ERR(gpu, "Failed compute shader fallback blit"); + } else { + pl_tex_blit_raster(gpu, params); + } + + pl_d3d11_flush_message_queue(ctx, "After texture blit"); +} + +bool pl_d3d11_tex_upload(pl_gpu gpu, const struct pl_tex_transfer_params *params) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + pl_tex tex = params->tex; + pl_fmt fmt = tex->params.format; + struct pl_tex_d3d11 *tex_p = PL_PRIV(tex); + struct pl_tex_transfer_params *slices = NULL; + bool ret = false; + + pl_d3d11_timer_start(gpu, params->timer); + + if (fmt->emulated) { + + int num_slices = pl_tex_transfer_slices(gpu, tex_p->texel_fmt, params, &slices); + for (int i = 0; i < num_slices; i++) { + // Copy the source data buffer into an intermediate buffer + pl_buf tbuf = pl_buf_create(gpu, pl_buf_params( + .memory_type = PL_BUF_MEM_DEVICE, + .format = tex_p->texel_fmt, + .size = pl_tex_transfer_size(&slices[i]), + .initial_data = slices[i].ptr, + .storable = true, + )); + + if (!tbuf) { + PL_ERR(gpu, "Failed creating buffer for tex upload fallback!"); + goto error; + } + + slices[i].ptr = NULL; + slices[i].buf = tbuf; + slices[i].buf_offset = 0; + bool ok = pl_tex_upload_texel(gpu, &slices[i]); + pl_buf_destroy(gpu, &tbuf); + if (!ok) + goto error; + } + + } else { + + ID3D11DeviceContext_UpdateSubresource(p->imm, tex_p->res, + tex_subresource(tex), &pl_rect3d_to_box(params->rc), params->ptr, + params->row_pitch, params->depth_pitch); + + } + + ret = true; + +error: + pl_d3d11_timer_end(gpu, params->timer); + pl_d3d11_flush_message_queue(ctx, "After texture upload"); + + pl_free(slices); + return ret; +} + +bool pl_d3d11_tex_download(pl_gpu gpu, const struct pl_tex_transfer_params *params) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + const struct pl_tex_t *tex = params->tex; + pl_fmt fmt = tex->params.format; + struct pl_tex_d3d11 *tex_p = PL_PRIV(tex); + struct pl_tex_transfer_params *slices = NULL; + bool ret = false; + + if (!tex_p->staging) + return false; + + pl_d3d11_timer_start(gpu, params->timer); + + if (fmt->emulated) { + + pl_buf tbuf = NULL; + int num_slices = pl_tex_transfer_slices(gpu, tex_p->texel_fmt, params, &slices); + for (int i = 0; i < num_slices; i++) { + const size_t slice_size = pl_tex_transfer_size(&slices[i]); + bool ok = pl_buf_recreate(gpu, &tbuf, pl_buf_params( + .storable = true, + .size = slice_size, + .memory_type = PL_BUF_MEM_DEVICE, + .format = tex_p->texel_fmt, + .host_readable = true, + )); + + if (!ok) { + PL_ERR(gpu, "Failed creating buffer for tex download fallback!"); + goto error; + } + + void *ptr = slices[i].ptr; + slices[i].ptr = NULL; + slices[i].buf = tbuf; + slices[i].buf_offset = 0; + + // Download into an intermediate buffer first + ok = pl_tex_download_texel(gpu, &slices[i]); + ok = ok && pl_buf_read(gpu, tbuf, 0, ptr, slice_size); + if (!ok) { + pl_buf_destroy(gpu, &tbuf); + goto error; + } + } + pl_buf_destroy(gpu, &tbuf); + + } else { + + ID3D11DeviceContext_CopySubresourceRegion(p->imm, + (ID3D11Resource *) tex_p->staging, 0, params->rc.x0, params->rc.y0, + params->rc.z0, 
tex_p->res, tex_subresource(tex), + &pl_rect3d_to_box(params->rc)); + + D3D11_MAPPED_SUBRESOURCE lock; + D3D(ID3D11DeviceContext_Map(p->imm, (ID3D11Resource *) tex_p->staging, 0, + D3D11_MAP_READ, 0, &lock)); + + char *cdst = params->ptr; + char *csrc = lock.pData; + size_t line_size = pl_rect_w(params->rc) * tex->params.format->texel_size; + for (int z = 0; z < pl_rect_d(params->rc); z++) { + for (int y = 0; y < pl_rect_h(params->rc); y++) { + memcpy(cdst + z * params->depth_pitch + y * params->row_pitch, + csrc + (params->rc.z0 + z) * lock.DepthPitch + + (params->rc.y0 + y) * lock.RowPitch + params->rc.x0, + line_size); + } + } + + ID3D11DeviceContext_Unmap(p->imm, (ID3D11Resource*)tex_p->staging, 0); + } + + ret = true; + +error: + pl_d3d11_timer_end(gpu, params->timer); + pl_d3d11_flush_message_queue(ctx, "After texture download"); + + pl_free(slices); + return ret; +} diff --git a/src/d3d11/meson.build b/src/d3d11/meson.build new file mode 100644 index 0000000..d4c4b44 --- /dev/null +++ b/src/d3d11/meson.build @@ -0,0 +1,41 @@ +d3d11 = get_option('d3d11') +d3d11_header = cc.check_header('d3d11.h', required: false) # needed publicly +d3d11_headers_extra = [ # needed internally + cc.check_header('d3d11_4.h', required: d3d11), + cc.check_header('dxgi1_6.h', required: d3d11), +] +d3d11_deps = [ + dependency('spirv-cross-c-shared', version: '>=0.29.0', required: d3d11), + cc.find_library('version', required: d3d11), +] + +d3d11 = d3d11.require(d3d11_header) +foreach h : d3d11_headers_extra + d3d11 = d3d11.require(h) +endforeach +foreach d : d3d11_deps + d3d11 = d3d11.require(d.found()) +endforeach + +components.set('d3d11', d3d11.allowed()) +if d3d11.allowed() + conf_internal.set('PL_HAVE_DXGI_DEBUG', + cc.has_header_symbol('dxgidebug.h', 'IID_IDXGIInfoQueue')) + conf_internal.set('PL_HAVE_DXGI_DEBUG_D3D11', + cc.has_header_symbol('d3d11sdklayers.h', 'DXGI_DEBUG_D3D11')) + add_project_arguments(['-DCOBJMACROS'], language: ['c', 'cpp']) + build_deps += declare_dependency(dependencies: d3d11_deps) + tests += 'd3d11.c' + sources += [ + 'd3d11/context.c', + 'd3d11/formats.c', + 'd3d11/gpu.c', + 'd3d11/gpu_buf.c', + 'd3d11/gpu_tex.c', + 'd3d11/gpu_pass.c', + 'd3d11/swapchain.c', + 'd3d11/utils.c', + ] +elif d3d11_header + sources += 'd3d11/stubs.c' +endif diff --git a/src/d3d11/stubs.c b/src/d3d11/stubs.c new file mode 100644 index 0000000..b3f259c --- /dev/null +++ b/src/d3d11/stubs.c @@ -0,0 +1,56 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "../common.h" +#include "log.h" + +#include <libplacebo/d3d11.h> + +const struct pl_d3d11_params pl_d3d11_default_params = { PL_D3D11_DEFAULTS }; + +pl_d3d11 pl_d3d11_create(pl_log log, const struct pl_d3d11_params *params) +{ + pl_fatal(log, "libplacebo compiled without D3D11 support!"); + return NULL; +} + +void pl_d3d11_destroy(pl_d3d11 *pd3d11) +{ + pl_d3d11 d3d11 = *pd3d11; + pl_assert(!d3d11); +} + +pl_d3d11 pl_d3d11_get(pl_gpu gpu) +{ + return NULL; +} + +pl_swapchain pl_d3d11_create_swapchain(pl_d3d11 d3d11, + const struct pl_d3d11_swapchain_params *params) +{ + pl_unreachable(); +} + +IDXGISwapChain *pl_d3d11_swapchain_unwrap(pl_swapchain sw) +{ + pl_unreachable(); +} + +pl_tex pl_d3d11_wrap(pl_gpu gpu, const struct pl_d3d11_wrap_params *params) +{ + pl_unreachable(); +} diff --git a/src/d3d11/swapchain.c b/src/d3d11/swapchain.c new file mode 100644 index 0000000..8a53632 --- /dev/null +++ b/src/d3d11/swapchain.c @@ -0,0 +1,667 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <windows.h> +#include <versionhelpers.h> +#include <math.h> + +#include "gpu.h" +#include "swapchain.h" +#include "utils.h" + +struct d3d11_csp_mapping { + DXGI_COLOR_SPACE_TYPE d3d11_csp; + DXGI_FORMAT d3d11_fmt; + struct pl_color_space out_csp; +}; + +static struct d3d11_csp_mapping map_pl_csp_to_d3d11(const struct pl_color_space *hint, + bool use_8bit_sdr) +{ + if (pl_color_space_is_hdr(hint) && + hint->transfer != PL_COLOR_TRC_LINEAR) + { + struct pl_color_space pl_csp = pl_color_space_hdr10; + pl_csp.hdr = (struct pl_hdr_metadata) { + // Whitelist only values that we support signalling metadata for + .prim = hint->hdr.prim, + .min_luma = hint->hdr.min_luma, + .max_luma = hint->hdr.max_luma, + .max_cll = hint->hdr.max_cll, + .max_fall = hint->hdr.max_fall, + }; + + return (struct d3d11_csp_mapping){ + .d3d11_csp = DXGI_COLOR_SPACE_RGB_FULL_G2084_NONE_P2020, + .d3d11_fmt = DXGI_FORMAT_R10G10B10A2_UNORM, + .out_csp = pl_csp, + }; + } else if (pl_color_primaries_is_wide_gamut(hint->primaries) || + hint->transfer == PL_COLOR_TRC_LINEAR) + { + // scRGB a la VK_COLOR_SPACE_EXTENDED_SRGB_LINEAR_EXT, + // so could be utilized for HDR/wide gamut content as well + // with content that goes beyond 0.0-1.0. + return (struct d3d11_csp_mapping){ + .d3d11_csp = DXGI_COLOR_SPACE_RGB_FULL_G10_NONE_P709, + .d3d11_fmt = DXGI_FORMAT_R16G16B16A16_FLOAT, + .out_csp = { + .primaries = PL_COLOR_PRIM_BT_709, + .transfer = PL_COLOR_TRC_LINEAR, + } + }; + } + + return (struct d3d11_csp_mapping){ + .d3d11_csp = DXGI_COLOR_SPACE_RGB_FULL_G22_NONE_P709, + .d3d11_fmt = use_8bit_sdr ? 
DXGI_FORMAT_R8G8B8A8_UNORM : + DXGI_FORMAT_R10G10B10A2_UNORM, + .out_csp = pl_color_space_monitor, + }; +} + +struct priv { + struct pl_sw_fns impl; + + struct d3d11_ctx *ctx; + IDXGISwapChain *swapchain; + pl_tex backbuffer; + + // Currently requested or applied swap chain configuration. + // Affected by received colorspace hints. + struct d3d11_csp_mapping csp_map; + + // Whether a swapchain backbuffer format reconfiguration has been + // requested by means of an additional resize action. + bool update_swapchain_format; + + // Whether 10-bit backbuffer format is disabled for SDR content. + bool disable_10bit_sdr; + + // Whether fallback to 8-bit RGB was triggered due to lack of compatibility + bool fallback_8bit_rgb; +}; + +static void d3d11_sw_destroy(pl_swapchain sw) +{ + struct priv *p = PL_PRIV(sw); + + pl_tex_destroy(sw->gpu, &p->backbuffer); + SAFE_RELEASE(p->swapchain); + pl_free((void *) sw); +} + +static int d3d11_sw_latency(pl_swapchain sw) +{ + struct priv *p = PL_PRIV(sw); + struct d3d11_ctx *ctx = p->ctx; + + UINT max_latency; + IDXGIDevice1_GetMaximumFrameLatency(ctx->dxgi_dev, &max_latency); + return max_latency; +} + +static pl_tex get_backbuffer(pl_swapchain sw) +{ + struct priv *p = PL_PRIV(sw); + struct d3d11_ctx *ctx = p->ctx; + ID3D11Texture2D *backbuffer = NULL; + pl_tex tex = NULL; + + D3D(IDXGISwapChain_GetBuffer(p->swapchain, 0, &IID_ID3D11Texture2D, + (void **) &backbuffer)); + + tex = pl_d3d11_wrap(sw->gpu, pl_d3d11_wrap_params( + .tex = (ID3D11Resource *) backbuffer, + )); + +error: + SAFE_RELEASE(backbuffer); + return tex; +} + +static bool d3d11_sw_resize(pl_swapchain sw, int *width, int *height) +{ + struct priv *p = PL_PRIV(sw); + struct d3d11_ctx *ctx = p->ctx; + + DXGI_SWAP_CHAIN_DESC desc = {0}; + IDXGISwapChain_GetDesc(p->swapchain, &desc); + int w = PL_DEF(*width, desc.BufferDesc.Width); + int h = PL_DEF(*height, desc.BufferDesc.Height); + bool format_changed = p->csp_map.d3d11_fmt != desc.BufferDesc.Format; + if (format_changed) { + PL_INFO(ctx, "Attempting to reconfigure swap chain format: %s -> %s", + pl_get_dxgi_format_name(desc.BufferDesc.Format), + pl_get_dxgi_format_name(p->csp_map.d3d11_fmt)); + } + + if (w != desc.BufferDesc.Width || h != desc.BufferDesc.Height || + format_changed) + { + if (p->backbuffer) { + PL_ERR(sw, "Tried resizing the swapchain while a frame was in " + "progress! 
Please submit the current frame first."); + return false; + } + + HRESULT hr = IDXGISwapChain_ResizeBuffers(p->swapchain, 0, w, h, + p->csp_map.d3d11_fmt, desc.Flags); + + if (hr == E_INVALIDARG && p->csp_map.d3d11_fmt != DXGI_FORMAT_R8G8B8A8_UNORM) + { + PL_WARN(sw, "Reconfiguring the swapchain failed, re-trying with R8G8B8A8_UNORM fallback."); + D3D(IDXGISwapChain_ResizeBuffers(p->swapchain, 0, w, h, + DXGI_FORMAT_R8G8B8A8_UNORM, desc.Flags)); + + // re-configure the colorspace to 8-bit RGB SDR fallback + p->csp_map = map_pl_csp_to_d3d11(&pl_color_space_unknown, true); + p->fallback_8bit_rgb = true; + } + else if (FAILED(hr)) + { + PL_ERR(sw, "Reconfiguring the swapchain failed with error: %s", pl_hresult_to_str(hr)); + return false; + } + } + + *width = w; + *height = h; + p->update_swapchain_format = false; + return true; + +error: + return false; +} + +static bool d3d11_sw_start_frame(pl_swapchain sw, + struct pl_swapchain_frame *out_frame) +{ + struct priv *p = PL_PRIV(sw); + struct d3d11_ctx *ctx = p->ctx; + + if (ctx->is_failed) + return false; + if (p->backbuffer) { + PL_ERR(sw, "Attempted calling `pl_swapchain_start_frame` while a frame " + "was already in progress! Call `pl_swapchain_submit_frame` first."); + return false; + } + + if (p->update_swapchain_format) { + int w = 0, h = 0; + if (!d3d11_sw_resize(sw, &w, &h)) + return false; + } + + p->backbuffer = get_backbuffer(sw); + if (!p->backbuffer) + return false; + + int bits = 0; + pl_fmt fmt = p->backbuffer->params.format; + for (int i = 0; i < fmt->num_components; i++) + bits = PL_MAX(bits, fmt->component_depth[i]); + + *out_frame = (struct pl_swapchain_frame) { + .fbo = p->backbuffer, + .flipped = false, + .color_repr = { + .sys = PL_COLOR_SYSTEM_RGB, + .levels = PL_COLOR_LEVELS_FULL, + .alpha = PL_ALPHA_UNKNOWN, + .bits = { + .sample_depth = bits, + .color_depth = bits, + }, + }, + .color_space = p->csp_map.out_csp, + }; + + return true; +} + +static bool d3d11_sw_submit_frame(pl_swapchain sw) +{ + struct priv *p = PL_PRIV(sw); + struct d3d11_ctx *ctx = p->ctx; + + // Release the backbuffer. We shouldn't hold onto it unnecessarily, because + // it prevents external code from resizing the swapchain, which we'd + // otherwise support just fine. 
+ pl_tex_destroy(sw->gpu, &p->backbuffer); + + return !ctx->is_failed; +} + +static void d3d11_sw_swap_buffers(pl_swapchain sw) +{ + struct priv *p = PL_PRIV(sw); + struct d3d11_ctx *ctx = p->ctx; + + // Present can fail with a device removed error + D3D(IDXGISwapChain_Present(p->swapchain, 1, 0)); + +error: + return; +} + +static DXGI_HDR_METADATA_HDR10 set_hdr10_metadata(const struct pl_hdr_metadata *hdr) +{ + return (DXGI_HDR_METADATA_HDR10) { + .RedPrimary = { roundf(hdr->prim.red.x * 50000), + roundf(hdr->prim.red.y * 50000) }, + .GreenPrimary = { roundf(hdr->prim.green.x * 50000), + roundf(hdr->prim.green.y * 50000) }, + .BluePrimary = { roundf(hdr->prim.blue.x * 50000), + roundf(hdr->prim.blue.y * 50000) }, + .WhitePoint = { roundf(hdr->prim.white.x * 50000), + roundf(hdr->prim.white.y * 50000) }, + .MaxMasteringLuminance = roundf(hdr->max_luma), + .MinMasteringLuminance = roundf(hdr->min_luma * 10000), + .MaxContentLightLevel = roundf(hdr->max_cll), + .MaxFrameAverageLightLevel = roundf(hdr->max_fall), + }; +} + +static bool set_swapchain_metadata(struct d3d11_ctx *ctx, + IDXGISwapChain3 *swapchain3, + struct d3d11_csp_mapping *csp_map) +{ + IDXGISwapChain4 *swapchain4 = NULL; + bool ret = false; + bool is_hdr = pl_color_space_is_hdr(&csp_map->out_csp); + DXGI_HDR_METADATA_HDR10 hdr10 = is_hdr ? + set_hdr10_metadata(&csp_map->out_csp.hdr) : (DXGI_HDR_METADATA_HDR10){ 0 }; + + D3D(IDXGISwapChain3_SetColorSpace1(swapchain3, csp_map->d3d11_csp)); + + // if we succeeded to set the color space, it's good enough, + // since older versions of Windows 10 will not have swapchain v4 available. + ret = true; + + if (FAILED(IDXGISwapChain3_QueryInterface(swapchain3, &IID_IDXGISwapChain4, + (void **)&swapchain4))) + { + PL_TRACE(ctx, "v4 swap chain interface is not available, skipping HDR10 " + "metadata configuration."); + goto error; + } + + D3D(IDXGISwapChain4_SetHDRMetaData(swapchain4, + is_hdr ? + DXGI_HDR_METADATA_TYPE_HDR10 : + DXGI_HDR_METADATA_TYPE_NONE, + is_hdr ? sizeof(hdr10) : 0, + is_hdr ? 
&hdr10 : NULL)); + + goto success; + +error: + csp_map->out_csp.hdr = (struct pl_hdr_metadata) { 0 }; +success: + SAFE_RELEASE(swapchain4); + return ret; +} + +static bool d3d11_format_supported(struct d3d11_ctx *ctx, DXGI_FORMAT fmt) +{ + UINT sup = 0; + UINT wanted_sup = + D3D11_FORMAT_SUPPORT_TEXTURE2D | D3D11_FORMAT_SUPPORT_DISPLAY | + D3D11_FORMAT_SUPPORT_SHADER_SAMPLE | D3D11_FORMAT_SUPPORT_RENDER_TARGET | + D3D11_FORMAT_SUPPORT_BLENDABLE; + + D3D(ID3D11Device_CheckFormatSupport(ctx->dev, fmt, &sup)); + + return (sup & wanted_sup) == wanted_sup; + +error: + return false; +} + +static bool d3d11_csp_supported(struct d3d11_ctx *ctx, + IDXGISwapChain3 *swapchain3, + DXGI_COLOR_SPACE_TYPE color_space) +{ + UINT csp_support_flags = 0; + + D3D(IDXGISwapChain3_CheckColorSpaceSupport(swapchain3, + color_space, + &csp_support_flags)); + + return (csp_support_flags & DXGI_SWAP_CHAIN_COLOR_SPACE_SUPPORT_FLAG_PRESENT); + +error: + return false; +} + +static void update_swapchain_color_config(pl_swapchain sw, + const struct pl_color_space *csp, + bool is_internal) +{ + struct priv *p = PL_PRIV(sw); + struct d3d11_ctx *ctx = p->ctx; + IDXGISwapChain3 *swapchain3 = NULL; + struct d3d11_csp_mapping old_map = p->csp_map; + + // ignore config changes in fallback mode + if (p->fallback_8bit_rgb) + goto cleanup; + + HRESULT hr = IDXGISwapChain_QueryInterface(p->swapchain, &IID_IDXGISwapChain3, + (void **)&swapchain3); + if (FAILED(hr)) { + PL_TRACE(ctx, "v3 swap chain interface is not available, skipping " + "color space configuration."); + swapchain3 = NULL; + } + + // Lack of swap chain v3 means we cannot control swap chain color space; + // the only effective formats are the 8 and 10 bit RGB ones. + struct d3d11_csp_mapping csp_map = + map_pl_csp_to_d3d11(swapchain3 ? csp : &pl_color_space_unknown, + p->disable_10bit_sdr); + + if (p->csp_map.d3d11_fmt == csp_map.d3d11_fmt && + p->csp_map.d3d11_csp == csp_map.d3d11_csp && + pl_color_space_equal(&p->csp_map.out_csp, &csp_map.out_csp)) + goto cleanup; + + PL_INFO(ctx, "%s swap chain configuration%s: format: %s, color space: %s.", + is_internal ? "Initial" : "New", + is_internal ? "" : " received from hint", + pl_get_dxgi_format_name(csp_map.d3d11_fmt), + pl_get_dxgi_csp_name(csp_map.d3d11_csp)); + + bool fmt_supported = d3d11_format_supported(ctx, csp_map.d3d11_fmt); + bool csp_supported = swapchain3 ? + d3d11_csp_supported(ctx, swapchain3, csp_map.d3d11_csp) : true; + if (!fmt_supported || !csp_supported) { + PL_ERR(ctx, "New swap chain configuration was deemed not supported: " + "format: %s, color space: %s. Falling back to 8-bit RGB.", + fmt_supported ? "supported" : "unsupported", + csp_supported ? 
"supported" : "unsupported"); + // fall back to 8bit sRGB if requested configuration is not supported + csp_map = map_pl_csp_to_d3d11(&pl_color_space_unknown, true); + } + + p->csp_map = csp_map; + p->update_swapchain_format = true; + + if (!swapchain3) + goto cleanup; + + if (!set_swapchain_metadata(ctx, swapchain3, &p->csp_map)) { + // format succeeded, but color space configuration failed + p->csp_map = old_map; + p->csp_map.d3d11_fmt = csp_map.d3d11_fmt; + } + + pl_d3d11_flush_message_queue(ctx, "After colorspace hint"); + +cleanup: + SAFE_RELEASE(swapchain3); +} + +static void d3d11_sw_colorspace_hint(pl_swapchain sw, + const struct pl_color_space *csp) +{ + update_swapchain_color_config(sw, csp, false); +} + +IDXGISwapChain *pl_d3d11_swapchain_unwrap(pl_swapchain sw) +{ + struct priv *p = PL_PRIV(sw); + IDXGISwapChain_AddRef(p->swapchain); + return p->swapchain; +} + +static const struct pl_sw_fns d3d11_swapchain = { + .destroy = d3d11_sw_destroy, + .latency = d3d11_sw_latency, + .resize = d3d11_sw_resize, + .colorspace_hint = d3d11_sw_colorspace_hint, + .start_frame = d3d11_sw_start_frame, + .submit_frame = d3d11_sw_submit_frame, + .swap_buffers = d3d11_sw_swap_buffers, +}; + +static HRESULT create_swapchain_1_2(struct d3d11_ctx *ctx, + IDXGIFactory2 *factory, const struct pl_d3d11_swapchain_params *params, + bool flip, UINT width, UINT height, DXGI_FORMAT format, + IDXGISwapChain **swapchain_out) +{ + IDXGISwapChain *swapchain = NULL; + IDXGISwapChain1 *swapchain1 = NULL; + HRESULT hr; + + DXGI_SWAP_CHAIN_DESC1 desc = { + .Width = width, + .Height = height, + .Format = format, + .SampleDesc.Count = 1, + .BufferUsage = DXGI_USAGE_SHADER_INPUT | DXGI_USAGE_RENDER_TARGET_OUTPUT, + .Flags = params->flags, + }; + + if (ID3D11Device_GetFeatureLevel(ctx->dev) >= D3D_FEATURE_LEVEL_11_0) + desc.BufferUsage |= DXGI_USAGE_UNORDERED_ACCESS; + + if (flip) { + UINT max_latency; + IDXGIDevice1_GetMaximumFrameLatency(ctx->dxgi_dev, &max_latency); + + // Make sure we have at least enough buffers to allow `max_latency` + // frames in-flight at once, plus one frame for the frontbuffer + desc.BufferCount = max_latency + 1; + + if (IsWindows10OrGreater()) { + desc.SwapEffect = DXGI_SWAP_EFFECT_FLIP_DISCARD; + } else { + desc.SwapEffect = DXGI_SWAP_EFFECT_FLIP_SEQUENTIAL; + } + + desc.BufferCount = PL_MIN(desc.BufferCount, DXGI_MAX_SWAP_CHAIN_BUFFERS); + } else { + desc.SwapEffect = DXGI_SWAP_EFFECT_DISCARD; + desc.BufferCount = 1; + } + + if (params->window) { + hr = IDXGIFactory2_CreateSwapChainForHwnd(factory, (IUnknown *) ctx->dev, + params->window, &desc, NULL, NULL, &swapchain1); + } else if (params->core_window) { + hr = IDXGIFactory2_CreateSwapChainForCoreWindow(factory, + (IUnknown *) ctx->dev, params->core_window, &desc, NULL, &swapchain1); + } else { + hr = IDXGIFactory2_CreateSwapChainForComposition(factory, + (IUnknown *) ctx->dev, &desc, NULL, &swapchain1); + } + if (FAILED(hr)) + goto done; + hr = IDXGISwapChain1_QueryInterface(swapchain1, &IID_IDXGISwapChain, + (void **) &swapchain); + if (FAILED(hr)) + goto done; + + *swapchain_out = swapchain; + swapchain = NULL; + +done: + SAFE_RELEASE(swapchain1); + SAFE_RELEASE(swapchain); + return hr; +} + +static HRESULT create_swapchain_1_1(struct d3d11_ctx *ctx, + IDXGIFactory1 *factory, const struct pl_d3d11_swapchain_params *params, + UINT width, UINT height, DXGI_FORMAT format, IDXGISwapChain **swapchain_out) +{ + DXGI_SWAP_CHAIN_DESC desc = { + .BufferDesc = { + .Width = width, + .Height = height, + .Format = format, + }, + .SampleDesc.Count 
= 1, + .BufferUsage = DXGI_USAGE_SHADER_INPUT | DXGI_USAGE_RENDER_TARGET_OUTPUT, + .BufferCount = 1, + .OutputWindow = params->window, + .Windowed = TRUE, + .SwapEffect = DXGI_SWAP_EFFECT_DISCARD, + .Flags = params->flags, + }; + + return IDXGIFactory1_CreateSwapChain(factory, (IUnknown *) ctx->dev, &desc, + swapchain_out); +} + +static IDXGISwapChain *create_swapchain(struct d3d11_ctx *ctx, + const struct pl_d3d11_swapchain_params *params, DXGI_FORMAT format) +{ + IDXGIDevice1 *dxgi_dev = NULL; + IDXGIAdapter1 *adapter = NULL; + IDXGIFactory1 *factory = NULL; + IDXGIFactory2 *factory2 = NULL; + IDXGISwapChain *swapchain = NULL; + bool success = false; + HRESULT hr; + + D3D(ID3D11Device_QueryInterface(ctx->dev, &IID_IDXGIDevice1, + (void **) &dxgi_dev)); + D3D(IDXGIDevice1_GetParent(dxgi_dev, &IID_IDXGIAdapter1, (void **) &adapter)); + D3D(IDXGIAdapter1_GetParent(adapter, &IID_IDXGIFactory1, (void **) &factory)); + + hr = IDXGIFactory1_QueryInterface(factory, &IID_IDXGIFactory2, + (void **) &factory2); + if (FAILED(hr)) + factory2 = NULL; + + bool flip = factory2 && !params->blit; + UINT width = PL_DEF(params->width, 1); + UINT height = PL_DEF(params->height, 1); + + // If both width and height are unset, the default size is the window size + if (params->window && params->width == 0 && params->height == 0) { + RECT rc; + if (GetClientRect(params->window, &rc)) { + width = PL_DEF(rc.right - rc.left, 1); + height = PL_DEF(rc.bottom - rc.top, 1); + } + } + + // Return here to retry creating the swapchain + do { + if (factory2) { + // Create a DXGI 1.2+ (Windows 8+) swap chain if possible + hr = create_swapchain_1_2(ctx, factory2, params, flip, width, + height, format, &swapchain); + } else { + // Fall back to DXGI 1.1 (Windows 7) + hr = create_swapchain_1_1(ctx, factory, params, width, height, + format, &swapchain); + } + if (SUCCEEDED(hr)) + break; + + pl_d3d11_after_error(ctx, hr); + if (flip) { + PL_DEBUG(ctx, "Failed to create flip-model swapchain, trying bitblt"); + flip = false; + continue; + } + + PL_FATAL(ctx, "Failed to create swapchain: %s", pl_hresult_to_str(hr)); + goto error; + } while (true); + + // Prevent DXGI from making changes to the window, otherwise it will hook + // the Alt+Enter keystroke and make it trigger an ugly transition to + // legacy exclusive fullscreen mode. + IDXGIFactory_MakeWindowAssociation(factory, params->window, + DXGI_MWA_NO_WINDOW_CHANGES | DXGI_MWA_NO_ALT_ENTER | + DXGI_MWA_NO_PRINT_SCREEN); + + success = true; +error: + if (!success) + SAFE_RELEASE(swapchain); + SAFE_RELEASE(factory2); + SAFE_RELEASE(factory); + SAFE_RELEASE(adapter); + SAFE_RELEASE(dxgi_dev); + return swapchain; +} + +pl_swapchain pl_d3d11_create_swapchain(pl_d3d11 d3d11, + const struct pl_d3d11_swapchain_params *params) +{ + struct d3d11_ctx *ctx = PL_PRIV(d3d11); + pl_gpu gpu = d3d11->gpu; + bool success = false; + + struct pl_swapchain_t *sw = pl_zalloc_obj(NULL, sw, struct priv); + struct priv *p = PL_PRIV(sw); + *sw = (struct pl_swapchain_t) { + .log = gpu->log, + .gpu = gpu, + }; + *p = (struct priv) { + .impl = d3d11_swapchain, + .ctx = ctx, + // default to standard 8 or 10 bit RGB, unset pl_color_space + .csp_map = { + .d3d11_fmt = params->disable_10bit_sdr ? + DXGI_FORMAT_R8G8B8A8_UNORM : + (d3d11_format_supported(ctx, DXGI_FORMAT_R10G10B10A2_UNORM) ? 
+ DXGI_FORMAT_R10G10B10A2_UNORM : DXGI_FORMAT_R8G8B8A8_UNORM), + }, + .disable_10bit_sdr = params->disable_10bit_sdr, + }; + + if (params->swapchain) { + p->swapchain = params->swapchain; + IDXGISwapChain_AddRef(params->swapchain); + } else { + p->swapchain = create_swapchain(ctx, params, p->csp_map.d3d11_fmt); + if (!p->swapchain) + goto error; + } + + DXGI_SWAP_CHAIN_DESC scd = {0}; + IDXGISwapChain_GetDesc(p->swapchain, &scd); + if (scd.SwapEffect == DXGI_SWAP_EFFECT_FLIP_SEQUENTIAL || + scd.SwapEffect == DXGI_SWAP_EFFECT_FLIP_DISCARD) { + PL_INFO(gpu, "Using flip-model presentation"); + } else { + PL_INFO(gpu, "Using bitblt-model presentation"); + } + + p->csp_map.d3d11_fmt = scd.BufferDesc.Format; + + update_swapchain_color_config(sw, &pl_color_space_unknown, true); + + success = true; +error: + if (!success) { + PL_FATAL(gpu, "Failed to create Direct3D 11 swapchain"); + d3d11_sw_destroy(sw); + sw = NULL; + } + return sw; +} diff --git a/src/d3d11/utils.c b/src/d3d11/utils.c new file mode 100644 index 0000000..47154b5 --- /dev/null +++ b/src/d3d11/utils.c @@ -0,0 +1,500 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <string.h> + +#include "utils.h" + +// D3D11.3 message IDs, not present in mingw-w64 v9 +#define D3D11_MESSAGE_ID_CREATE_FENCE (0x30020c) +#define D3D11_MESSAGE_ID_DESTROY_FENCE (0x30020a) + +#ifdef PL_HAVE_DXGI_DEBUG +static enum pl_log_level log_level_override(unsigned int id) +{ + switch (id) { + // These warnings can happen when a pl_timer is used too often before a + // blocking pl_swapchain_swap_buffers() or pl_gpu_finish(), overflowing + // its internal ring buffer and causing older query objects to be reused + // before their results are read. This is expected behavior, so reduce + // the log level to PL_LOG_TRACE to prevent log spam. + case D3D11_MESSAGE_ID_QUERY_BEGIN_ABANDONING_PREVIOUS_RESULTS: + case D3D11_MESSAGE_ID_QUERY_END_ABANDONING_PREVIOUS_RESULTS: + return PL_LOG_TRACE; + + // D3D11 writes log messages every time an object is created or + // destroyed. That results in a lot of log spam, so force PL_LOG_TRACE. 
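+ // The helper macro below expands to a pair of case labels, e.g.
+ // OBJ_LIFETIME_MESSAGES(BUFFER) becomes
+ //     case D3D11_MESSAGE_ID_CREATE_BUFFER:
+ //     case D3D11_MESSAGE_ID_DESTROY_BUFFER
+ // so a single `return PL_LOG_TRACE` covers both the creation and the
+ // destruction message of every listed object type.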
+#define OBJ_LIFETIME_MESSAGES(obj) \ + case D3D11_MESSAGE_ID_CREATE_ ## obj: \ + case D3D11_MESSAGE_ID_DESTROY_ ## obj + + OBJ_LIFETIME_MESSAGES(CONTEXT): + OBJ_LIFETIME_MESSAGES(BUFFER): + OBJ_LIFETIME_MESSAGES(TEXTURE1D): + OBJ_LIFETIME_MESSAGES(TEXTURE2D): + OBJ_LIFETIME_MESSAGES(TEXTURE3D): + OBJ_LIFETIME_MESSAGES(SHADERRESOURCEVIEW): + OBJ_LIFETIME_MESSAGES(RENDERTARGETVIEW): + OBJ_LIFETIME_MESSAGES(DEPTHSTENCILVIEW): + OBJ_LIFETIME_MESSAGES(VERTEXSHADER): + OBJ_LIFETIME_MESSAGES(HULLSHADER): + OBJ_LIFETIME_MESSAGES(DOMAINSHADER): + OBJ_LIFETIME_MESSAGES(GEOMETRYSHADER): + OBJ_LIFETIME_MESSAGES(PIXELSHADER): + OBJ_LIFETIME_MESSAGES(INPUTLAYOUT): + OBJ_LIFETIME_MESSAGES(SAMPLER): + OBJ_LIFETIME_MESSAGES(BLENDSTATE): + OBJ_LIFETIME_MESSAGES(DEPTHSTENCILSTATE): + OBJ_LIFETIME_MESSAGES(RASTERIZERSTATE): + OBJ_LIFETIME_MESSAGES(QUERY): + OBJ_LIFETIME_MESSAGES(PREDICATE): + OBJ_LIFETIME_MESSAGES(COUNTER): + OBJ_LIFETIME_MESSAGES(COMMANDLIST): + OBJ_LIFETIME_MESSAGES(CLASSINSTANCE): + OBJ_LIFETIME_MESSAGES(CLASSLINKAGE): + OBJ_LIFETIME_MESSAGES(COMPUTESHADER): + OBJ_LIFETIME_MESSAGES(UNORDEREDACCESSVIEW): + OBJ_LIFETIME_MESSAGES(VIDEODECODER): + OBJ_LIFETIME_MESSAGES(VIDEOPROCESSORENUM): + OBJ_LIFETIME_MESSAGES(VIDEOPROCESSOR): + OBJ_LIFETIME_MESSAGES(DECODEROUTPUTVIEW): + OBJ_LIFETIME_MESSAGES(PROCESSORINPUTVIEW): + OBJ_LIFETIME_MESSAGES(PROCESSOROUTPUTVIEW): + OBJ_LIFETIME_MESSAGES(DEVICECONTEXTSTATE): + OBJ_LIFETIME_MESSAGES(FENCE): + return PL_LOG_TRACE; + +#undef OBJ_LIFETIME_MESSAGES + + // Don't force the log level of any other messages. It will be mapped + // from the D3D severity code instead. + default: + return PL_LOG_NONE; + } +} +#endif + +void pl_d3d11_flush_message_queue(struct d3d11_ctx *ctx, const char *header) +{ +#ifdef PL_HAVE_DXGI_DEBUG + if (!ctx->iqueue) + return; + + static const enum pl_log_level severity_map[] = { + [DXGI_INFO_QUEUE_MESSAGE_SEVERITY_CORRUPTION] = PL_LOG_FATAL, + [DXGI_INFO_QUEUE_MESSAGE_SEVERITY_ERROR] = PL_LOG_ERR, + [DXGI_INFO_QUEUE_MESSAGE_SEVERITY_WARNING] = PL_LOG_WARN, + [DXGI_INFO_QUEUE_MESSAGE_SEVERITY_INFO] = PL_LOG_DEBUG, + [DXGI_INFO_QUEUE_MESSAGE_SEVERITY_MESSAGE] = PL_LOG_DEBUG, + }; + + enum pl_log_level header_printed = PL_LOG_NONE; + + // After the storage limit is reached and ID3D11InfoQueue::ClearStoredMessages + // is called, the message counter seems to be initialized to -1, which is a + // very large number when read as uint64_t. Any subsequent call to + // ID3D11InfoQueue::GetNumStoredMessages will be off by one. + // Use ID3D11InfoQueue_GetNumStoredMessagesAllowedByRetrievalFilter without + // any filter set, which seems to be unaffected by this bug and returns the + // correct number of messages. + // IDXGIInfoQueue seems to be unaffected, but keep the same way of retrieval + uint64_t messages = IDXGIInfoQueue_GetNumStoredMessagesAllowedByRetrievalFilters(ctx->iqueue, DXGI_DEBUG_ALL); + + // Just to be on the safe side, check also for the mentioned -1 value...
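+ // (Read as an unsigned 64-bit value, -1 is UINT64_MAX, i.e.
+ // 0xFFFFFFFFFFFFFFFF or roughly 1.8e19 "stored messages", hence the
+ // explicit check below.)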
+ if (!messages || messages == UINT64_C(-1)) + return; + + uint64_t discarded = + IDXGIInfoQueue_GetNumMessagesDiscardedByMessageCountLimit(ctx->iqueue, DXGI_DEBUG_ALL); + if (discarded > ctx->last_discarded) { + PL_WARN(ctx, "%s:", header); + header_printed = PL_LOG_WARN; + + // Notify number of messages skipped due to the message count limit + PL_WARN(ctx, " (skipped %"PRIu64" debug layer messages)", + discarded - ctx->last_discarded); + ctx->last_discarded = discarded; + } + + // Copy debug layer messages to libplacebo's log output + for (uint64_t i = 0; i < messages; i++) { + SIZE_T len; + if (FAILED(IDXGIInfoQueue_GetMessage(ctx->iqueue, DXGI_DEBUG_ALL, i, NULL, &len))) + goto error; + + pl_grow((void *) ctx->d3d11, &ctx->dxgi_msg, len); + DXGI_INFO_QUEUE_MESSAGE *dxgi_msg = ctx->dxgi_msg; + + if (FAILED(IDXGIInfoQueue_GetMessage(ctx->iqueue, DXGI_DEBUG_ALL, i, dxgi_msg, &len))) + goto error; + + enum pl_log_level level = PL_LOG_NONE; + if (IsEqualGUID(&dxgi_msg->Producer, &DXGI_DEBUG_D3D11)) + level = log_level_override(dxgi_msg->ID); + if (level == PL_LOG_NONE) + level = severity_map[dxgi_msg->Severity]; + + if (pl_msg_test(ctx->log, level)) { + // If the header hasn't been printed, or it was printed for a lower + // log level than the current message, print it (again) + if (header_printed == PL_LOG_NONE || header_printed > level) { + PL_MSG(ctx, level, "%s:", header); + pl_log_stack_trace(ctx->log, level); + header_printed = level; + } + + PL_MSG(ctx, level, " %d: %.*s", (int) dxgi_msg->ID, + (int) dxgi_msg->DescriptionByteLength, dxgi_msg->pDescription); + } + + if (dxgi_msg->Severity <= DXGI_INFO_QUEUE_MESSAGE_SEVERITY_ERROR) + pl_debug_abort(); + } + +error: + IDXGIInfoQueue_ClearStoredMessages(ctx->iqueue, DXGI_DEBUG_ALL); +#endif +} + +HRESULT pl_d3d11_check_device_removed(struct d3d11_ctx *ctx, HRESULT hr) +{ + // This can be called before we have a device + if (!ctx->dev) + return hr; + + switch (hr) { + case DXGI_ERROR_DEVICE_HUNG: + case DXGI_ERROR_DEVICE_RESET: + case DXGI_ERROR_DRIVER_INTERNAL_ERROR: + ctx->is_failed = true; + break; + case D3DDDIERR_DEVICEREMOVED: + case DXGI_ERROR_DEVICE_REMOVED: + hr = ID3D11Device_GetDeviceRemovedReason(ctx->dev); + ctx->is_failed = true; + break; + } + if (ctx->is_failed) + PL_ERR(ctx, "Device lost!"); + return hr; +} + +HRESULT pl_d3d11_after_error(struct d3d11_ctx *ctx, HRESULT hr) +{ + hr = pl_d3d11_check_device_removed(ctx, hr); + pl_d3d11_flush_message_queue(ctx, "After error"); + return hr; +} + +struct dll_version pl_get_dll_version(const wchar_t *name) +{ + void *data = NULL; + struct dll_version ret = {0}; + + DWORD size = GetFileVersionInfoSizeW(name, &(DWORD) {0}); + if (!size) + goto error; + data = pl_alloc(NULL, size); + + if (!GetFileVersionInfoW(name, 0, size, data)) + goto error; + + VS_FIXEDFILEINFO *ffi; + UINT ffi_len; + if (!VerQueryValueW(data, L"\\", (void**)&ffi, &ffi_len)) + goto error; + if (ffi_len < sizeof(*ffi)) + goto error; + + ret = (struct dll_version) { + .major = HIWORD(ffi->dwFileVersionMS), + .minor = LOWORD(ffi->dwFileVersionMS), + .build = HIWORD(ffi->dwFileVersionLS), + .revision = LOWORD(ffi->dwFileVersionLS), + }; + +error: + pl_free(data); + return ret; +} + +wchar_t *pl_from_utf8(void *ctx, const char *str) +{ + int count = MultiByteToWideChar(CP_UTF8, 0, str, -1, NULL, 0); + pl_assert(count > 0); + wchar_t *ret = pl_calloc_ptr(ctx, count, ret); + MultiByteToWideChar(CP_UTF8, 0, str, -1, ret, count); + return ret; +} + +char *pl_to_utf8(void *ctx, const wchar_t *str) +{ + int count = 
WideCharToMultiByte(CP_UTF8, 0, str, -1, NULL, 0, NULL, NULL); + pl_assert(count > 0); + char *ret = pl_calloc_ptr(ctx, count, ret); + WideCharToMultiByte(CP_UTF8, 0, str, -1, ret, count, NULL, NULL); + return ret; +} + +static const char *hresult_str(HRESULT hr) +{ + switch (hr) { +#define CASE(name) case name: return #name + CASE(S_OK); + CASE(S_FALSE); + CASE(E_ABORT); + CASE(E_ACCESSDENIED); + CASE(E_FAIL); + CASE(E_HANDLE); + CASE(E_INVALIDARG); + CASE(E_NOINTERFACE); + CASE(E_NOTIMPL); + CASE(E_OUTOFMEMORY); + CASE(E_POINTER); + CASE(E_UNEXPECTED); + + CASE(DXGI_ERROR_ACCESS_DENIED); + CASE(DXGI_ERROR_ACCESS_LOST); + CASE(DXGI_ERROR_CANNOT_PROTECT_CONTENT); + CASE(DXGI_ERROR_DEVICE_HUNG); + CASE(DXGI_ERROR_DEVICE_REMOVED); + CASE(DXGI_ERROR_DEVICE_RESET); + CASE(DXGI_ERROR_DRIVER_INTERNAL_ERROR); + CASE(DXGI_ERROR_FRAME_STATISTICS_DISJOINT); + CASE(DXGI_ERROR_GRAPHICS_VIDPN_SOURCE_IN_USE); + CASE(DXGI_ERROR_INVALID_CALL); + CASE(DXGI_ERROR_MORE_DATA); + CASE(DXGI_ERROR_NAME_ALREADY_EXISTS); + CASE(DXGI_ERROR_NONEXCLUSIVE); + CASE(DXGI_ERROR_NOT_CURRENTLY_AVAILABLE); + CASE(DXGI_ERROR_NOT_FOUND); + CASE(DXGI_ERROR_REMOTE_CLIENT_DISCONNECTED); + CASE(DXGI_ERROR_REMOTE_OUTOFMEMORY); + CASE(DXGI_ERROR_RESTRICT_TO_OUTPUT_STALE); + CASE(DXGI_ERROR_SDK_COMPONENT_MISSING); + CASE(DXGI_ERROR_SESSION_DISCONNECTED); + CASE(DXGI_ERROR_UNSUPPORTED); + CASE(DXGI_ERROR_WAIT_TIMEOUT); + CASE(DXGI_ERROR_WAS_STILL_DRAWING); +#undef CASE + + default: + return "Unknown error"; + } +} + +static char *format_error(void *ctx, DWORD error) +{ + wchar_t *wstr; + if (!FormatMessageW(FORMAT_MESSAGE_ALLOCATE_BUFFER | + FORMAT_MESSAGE_FROM_SYSTEM | + FORMAT_MESSAGE_IGNORE_INSERTS, NULL, error, + MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), + (LPWSTR)&wstr, 0, NULL)) + { + return NULL; + } + + // Trim any trailing newline from the message + for (int i = wcslen(wstr) - 1; i >= 0; i--) { + if (wstr[i] != '\r' && wstr[i] != '\n') { + wstr[i + 1] = '\0'; + break; + } + } + + char *str = pl_to_utf8(ctx, wstr); + LocalFree(wstr); + return str; +} + +char *pl_hresult_to_str_buf(char *buf, size_t buf_size, HRESULT hr) +{ + char *fmsg = format_error(NULL, hr); + const char *code = hresult_str(hr); + if (fmsg) { + snprintf(buf, buf_size, "%s (%s, 0x%08lx)", fmsg, code, hr); + } else { + snprintf(buf, buf_size, "%s, 0x%08lx", code, hr); + } + pl_free(fmsg); + return buf; +} + +#define D3D11_DXGI_ENUM(prefix, define) { case prefix ## define: return #define; } + +const char *pl_get_dxgi_format_name(DXGI_FORMAT fmt) +{ + switch (fmt) { + D3D11_DXGI_ENUM(DXGI_FORMAT_, UNKNOWN); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R32G32B32A32_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R32G32B32A32_FLOAT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R32G32B32A32_UINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R32G32B32A32_SINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R32G32B32_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R32G32B32_FLOAT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R32G32B32_UINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R32G32B32_SINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R16G16B16A16_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R16G16B16A16_FLOAT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R16G16B16A16_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R16G16B16A16_UINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R16G16B16A16_SNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R16G16B16A16_SINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R32G32_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R32G32_FLOAT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R32G32_UINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R32G32_SINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, 
R32G8X24_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, D32_FLOAT_S8X24_UINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R32_FLOAT_X8X24_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, X32_TYPELESS_G8X24_UINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R10G10B10A2_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R10G10B10A2_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R10G10B10A2_UINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R11G11B10_FLOAT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R8G8B8A8_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R8G8B8A8_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R8G8B8A8_UNORM_SRGB); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R8G8B8A8_UINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R8G8B8A8_SNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R8G8B8A8_SINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R16G16_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R16G16_FLOAT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R16G16_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R16G16_UINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R16G16_SNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R16G16_SINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R32_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, D32_FLOAT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R32_FLOAT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R32_UINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R32_SINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R24G8_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, D24_UNORM_S8_UINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R24_UNORM_X8_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, X24_TYPELESS_G8_UINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R8G8_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R8G8_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R8G8_UINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R8G8_SNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R8G8_SINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R16_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R16_FLOAT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, D16_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R16_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R16_UINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R16_SNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R16_SINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R8_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R8_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R8_UINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R8_SNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R8_SINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, A8_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R1_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R9G9B9E5_SHAREDEXP); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R8G8_B8G8_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, G8R8_G8B8_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC1_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC1_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC1_UNORM_SRGB); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC2_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC2_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC2_UNORM_SRGB); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC3_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC3_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC3_UNORM_SRGB); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC4_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC4_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC4_SNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC5_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC5_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC5_SNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, B5G6R5_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, B5G5R5A1_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, B8G8R8A8_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, B8G8R8X8_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R10G10B10_XR_BIAS_A2_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, B8G8R8A8_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, B8G8R8A8_UNORM_SRGB); + D3D11_DXGI_ENUM(DXGI_FORMAT_, B8G8R8X8_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, 
B8G8R8X8_UNORM_SRGB); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC6H_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC6H_UF16); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC6H_SF16); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC7_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC7_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC7_UNORM_SRGB); + D3D11_DXGI_ENUM(DXGI_FORMAT_, AYUV); + D3D11_DXGI_ENUM(DXGI_FORMAT_, Y410); + D3D11_DXGI_ENUM(DXGI_FORMAT_, Y416); + D3D11_DXGI_ENUM(DXGI_FORMAT_, NV12); + D3D11_DXGI_ENUM(DXGI_FORMAT_, P010); + D3D11_DXGI_ENUM(DXGI_FORMAT_, P016); + D3D11_DXGI_ENUM(DXGI_FORMAT_, 420_OPAQUE); + D3D11_DXGI_ENUM(DXGI_FORMAT_, YUY2); + D3D11_DXGI_ENUM(DXGI_FORMAT_, Y210); + D3D11_DXGI_ENUM(DXGI_FORMAT_, Y216); + D3D11_DXGI_ENUM(DXGI_FORMAT_, NV11); + D3D11_DXGI_ENUM(DXGI_FORMAT_, AI44); + D3D11_DXGI_ENUM(DXGI_FORMAT_, IA44); + D3D11_DXGI_ENUM(DXGI_FORMAT_, P8); + D3D11_DXGI_ENUM(DXGI_FORMAT_, A8P8); + D3D11_DXGI_ENUM(DXGI_FORMAT_, B4G4R4A4_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, P208); + D3D11_DXGI_ENUM(DXGI_FORMAT_, V208); + D3D11_DXGI_ENUM(DXGI_FORMAT_, V408); + D3D11_DXGI_ENUM(DXGI_FORMAT_, FORCE_UINT); + } + + return "<unknown>"; +} + +const char *pl_get_dxgi_csp_name(DXGI_COLOR_SPACE_TYPE csp) +{ + switch ((int) csp) { + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, RGB_FULL_G22_NONE_P709); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, RGB_FULL_G10_NONE_P709); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, RGB_STUDIO_G22_NONE_P709); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, RGB_STUDIO_G22_NONE_P2020); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, RESERVED); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_FULL_G22_NONE_P709_X601); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_STUDIO_G22_LEFT_P601); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_FULL_G22_LEFT_P601); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_STUDIO_G22_LEFT_P709); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_FULL_G22_LEFT_P709); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_STUDIO_G22_LEFT_P2020); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_FULL_G22_LEFT_P2020); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, RGB_FULL_G2084_NONE_P2020); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_STUDIO_G2084_LEFT_P2020); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, RGB_STUDIO_G2084_NONE_P2020); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_STUDIO_G22_TOPLEFT_P2020); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_STUDIO_G2084_TOPLEFT_P2020); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, RGB_FULL_G22_NONE_P2020); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_STUDIO_GHLG_TOPLEFT_P2020); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_FULL_GHLG_TOPLEFT_P2020); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, RGB_STUDIO_G24_NONE_P709); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, RGB_STUDIO_G24_NONE_P2020); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_STUDIO_G24_LEFT_P709); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_STUDIO_G24_LEFT_P2020); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_STUDIO_G24_TOPLEFT_P2020); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, CUSTOM); + } + + return "<unknown>"; +} diff --git a/src/d3d11/utils.h b/src/d3d11/utils.h new file mode 100644 index 0000000..86b4072 --- /dev/null +++ b/src/d3d11/utils.h @@ -0,0 +1,88 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include "common.h" + +#define DXGI_COLOR_SPACE_RGB_STUDIO_G24_NONE_P709 ((DXGI_COLOR_SPACE_TYPE)20) +#define DXGI_COLOR_SPACE_RGB_STUDIO_G24_NONE_P2020 ((DXGI_COLOR_SPACE_TYPE)21) +#define DXGI_COLOR_SPACE_YCBCR_STUDIO_G24_LEFT_P709 ((DXGI_COLOR_SPACE_TYPE)22) +#define DXGI_COLOR_SPACE_YCBCR_STUDIO_G24_LEFT_P2020 ((DXGI_COLOR_SPACE_TYPE)23) +#define DXGI_COLOR_SPACE_YCBCR_STUDIO_G24_TOPLEFT_P2020 ((DXGI_COLOR_SPACE_TYPE)24) + +// Flush debug messages from D3D11's info queue to libplacebo's log output. +// Should be called regularly. +void pl_d3d11_flush_message_queue(struct d3d11_ctx *ctx, const char *header); + +// Some D3D11 functions can fail with a set of HRESULT codes which indicate the +// device has been removed. This is equivalent to libplacebo's gpu_is_failed +// state and indicates that the pl_gpu needs to be recreated. This function +// checks for one of those HRESULTs, sets the failed state, and returns a +// specific HRESULT that indicates why the device was removed (e.g. GPU hang, +// driver crash, etc.) +HRESULT pl_d3d11_check_device_removed(struct d3d11_ctx *ctx, HRESULT hr); + +// Helper function for the D3D() macro, though it can be called directly when +// handling D3D11 errors if the D3D() macro isn't suitable for some reason. +// Calls `pl_d3d11_check_device_removed` and `pl_d3d11_flush_message_queue` and +// returns the specific HRESULT from `pl_d3d11_check_device_removed` for logging +// purposes. +HRESULT pl_d3d11_after_error(struct d3d11_ctx *ctx, HRESULT hr); + +// Convenience macro for running DXGI/D3D11 functions and performing appropriate +// actions on failure. Can also be used for any HRESULT-returning function. +#define D3D(call) \ + do { \ + HRESULT hr_ = (call); \ + if (FAILED(hr_)) { \ + hr_ = pl_d3d11_after_error(ctx, hr_); \ + PL_ERR(ctx, "%s: %s (%s:%d)", #call, pl_hresult_to_str(hr_), \ + __FILE__, __LINE__); \ + goto error; \ + } \ + } while (0); + +// Conditionally release a COM interface and set the pointer to NULL +#define SAFE_RELEASE(iface) \ + do { \ + if (iface) \ + (iface)->lpVtbl->Release(iface); \ + (iface) = NULL; \ + } while (0) + +struct dll_version { + uint16_t major; + uint16_t minor; + uint16_t build; + uint16_t revision; +}; + +// Get the version number of a DLL. This calls GetFileVersionInfoW, which should +// call LoadLibraryExW internally, so it should get the same copy of the DLL +// that is loaded into memory if there is a copy in System32 and a copy in the +// %PATH% or application directory. +struct dll_version pl_get_dll_version(const wchar_t *name); + +wchar_t *pl_from_utf8(void *ctx, const char *str); +char *pl_to_utf8(void *ctx, const wchar_t *str); + +#define pl_hresult_to_str(hr) pl_hresult_to_str_buf((char[256]){0}, 256, (hr)) +char *pl_hresult_to_str_buf(char *buf, size_t buf_size, HRESULT hr); + +const char *pl_get_dxgi_csp_name(DXGI_COLOR_SPACE_TYPE csp); +const char *pl_get_dxgi_format_name(DXGI_FORMAT fmt); diff --git a/src/dispatch.c b/src/dispatch.c new file mode 100644 index 0000000..308dd56 --- /dev/null +++ b/src/dispatch.c @@ -0,0 +1,1615 @@ +/* + * This file is part of libplacebo.
+ * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "common.h" +#include "log.h" +#include "shaders.h" +#include "dispatch.h" +#include "gpu.h" +#include "pl_thread.h" + +// Maximum number of passes to keep around at once. If full, passes older than +// MIN_AGE are evicted to make room. (Failing that, the passes array doubles) +#define MAX_PASSES 100 +#define MIN_AGE 10 + +enum { + TMP_PRELUDE, // GLSL version, global definitions, etc. + TMP_MAIN, // main GLSL shader body + TMP_VERT_HEAD, // vertex shader inputs/outputs + TMP_VERT_BODY, // vertex shader body + TMP_COUNT, +}; + +struct pl_dispatch_t { + pl_mutex lock; + pl_log log; + pl_gpu gpu; + uint8_t current_ident; + uint8_t current_index; + bool dynamic_constants; + int max_passes; + + void (*info_callback)(void *, const struct pl_dispatch_info *); + void *info_priv; + + PL_ARRAY(pl_shader) shaders; // to avoid re-allocations + PL_ARRAY(struct pass *) passes; // compiled passes + + // temporary buffers to help avoid re_allocations during pass creation + PL_ARRAY(const struct pl_buffer_var *) buf_tmp; + pl_str_builder tmp[TMP_COUNT]; + uint8_t *ubo_tmp; +}; + +enum pass_var_type { + PASS_VAR_NONE = 0, + PASS_VAR_GLOBAL, // regular/global uniforms + PASS_VAR_UBO, // uniform buffers + PASS_VAR_PUSHC // push constants +}; + +// Cached metadata about a variable's effective placement / update method +struct pass_var { + int index; // for pl_var_update + enum pass_var_type type; + struct pl_var_layout layout; + void *cached_data; +}; + +struct pass { + uint64_t signature; + pl_pass pass; + int last_index; + + // contains cached data and update metadata, same order as pl_shader + struct pass_var *vars; + int num_var_locs; + + // for uniform buffer updates + struct pl_shader_desc ubo_desc; // temporary + int ubo_index; + pl_buf ubo; + + // Cached pl_pass_run_params. 
This will also contain mutable allocations + // for the push constants, descriptor bindings (including the binding for + // the UBO pre-filled), vertex array and variable updates + struct pl_pass_run_params run_params; + + // for pl_dispatch_info + pl_timer timer; + uint64_t ts_last; + uint64_t ts_peak; + uint64_t ts_sum; + uint64_t samples[PL_ARRAY_SIZE(((struct pl_dispatch_info *) NULL)->samples)]; + int ts_idx; +}; + +static void pass_destroy(pl_dispatch dp, struct pass *pass) +{ + if (!pass) + return; + + pl_buf_destroy(dp->gpu, &pass->ubo); + pl_pass_destroy(dp->gpu, &pass->pass); + pl_timer_destroy(dp->gpu, &pass->timer); + pl_free(pass); +} + +pl_dispatch pl_dispatch_create(pl_log log, pl_gpu gpu) +{ + struct pl_dispatch_t *dp = pl_zalloc_ptr(NULL, dp); + pl_mutex_init(&dp->lock); + dp->log = log; + dp->gpu = gpu; + dp->max_passes = MAX_PASSES; + for (int i = 0; i < PL_ARRAY_SIZE(dp->tmp); i++) + dp->tmp[i] = pl_str_builder_alloc(dp); + + return dp; +} + +void pl_dispatch_destroy(pl_dispatch *ptr) +{ + pl_dispatch dp = *ptr; + if (!dp) + return; + + for (int i = 0; i < dp->passes.num; i++) + pass_destroy(dp, dp->passes.elem[i]); + for (int i = 0; i < dp->shaders.num; i++) + pl_shader_free(&dp->shaders.elem[i]); + + pl_mutex_destroy(&dp->lock); + pl_free(dp); + *ptr = NULL; +} + +pl_shader pl_dispatch_begin_ex(pl_dispatch dp, bool unique) +{ + pl_mutex_lock(&dp->lock); + + struct pl_shader_params params = { + .id = unique ? dp->current_ident++ : 0, + .gpu = dp->gpu, + .index = dp->current_index, + .dynamic_constants = dp->dynamic_constants, + }; + + pl_shader sh = NULL; + PL_ARRAY_POP(dp->shaders, &sh); + pl_mutex_unlock(&dp->lock); + + if (sh) { + pl_shader_reset(sh, ¶ms); + return sh; + } + + return pl_shader_alloc(dp->log, ¶ms); +} + +void pl_dispatch_mark_dynamic(pl_dispatch dp, bool dynamic) +{ + dp->dynamic_constants = dynamic; +} + +void pl_dispatch_callback(pl_dispatch dp, void *priv, + void (*cb)(void *priv, const struct pl_dispatch_info *)) +{ + dp->info_callback = cb; + dp->info_priv = priv; +} + +pl_shader pl_dispatch_begin(pl_dispatch dp) +{ + return pl_dispatch_begin_ex(dp, false); +} + +static bool add_pass_var(pl_dispatch dp, void *tmp, struct pass *pass, + struct pl_pass_params *params, + const struct pl_shader_var *sv, struct pass_var *pv, + bool greedy) +{ + pl_gpu gpu = dp->gpu; + if (pv->type) + return true; + + // Try not to use push constants for "large" values like matrices in the + // first pass, since this is likely to exceed the VGPR/pushc size budgets + bool try_pushc = greedy || (sv->var.dim_m == 1 && sv->var.dim_a == 1) || sv->dynamic; + if (try_pushc && gpu->glsl.vulkan && gpu->limits.max_pushc_size) { + pv->layout = pl_std430_layout(params->push_constants_size, &sv->var); + size_t new_size = pv->layout.offset + pv->layout.size; + if (new_size <= gpu->limits.max_pushc_size) { + params->push_constants_size = new_size; + pv->type = PASS_VAR_PUSHC; + return true; + } + } + + // If we haven't placed all PCs yet, don't place anything else, since + // we want to try and fit more stuff into PCs before "giving up" + if (!greedy) + return true; + + int num_locs = sv->var.dim_v * sv->var.dim_m * sv->var.dim_a; + bool can_var = pass->num_var_locs + num_locs <= gpu->limits.max_variable_comps; + + // Attempt using uniform buffer next. The GLSL version 440 check is due + // to explicit offsets on UBO entries. 
In theory we could leave away + // the offsets and support UBOs for older GL as well, but this is a nice + // safety net for driver bugs (and also rules out potentially buggy drivers) + // Also avoid UBOs for highly dynamic stuff since that requires synchronizing + // the UBO writes every frame + bool try_ubo = !can_var || !sv->dynamic; + if (try_ubo && gpu->glsl.version >= 440 && gpu->limits.max_ubo_size) { + if (sh_buf_desc_append(tmp, gpu, &pass->ubo_desc, &pv->layout, sv->var)) { + pv->type = PASS_VAR_UBO; + return true; + } + } + + // Otherwise, use global uniforms + if (can_var) { + pv->type = PASS_VAR_GLOBAL; + pv->index = params->num_variables; + pv->layout = pl_var_host_layout(0, &sv->var); + PL_ARRAY_APPEND_RAW(tmp, params->variables, params->num_variables, sv->var); + pass->num_var_locs += num_locs; + return true; + } + + // Ran out of variable binding methods. The most likely scenario in which + // this can happen is if we're using a GPU that does not support global + // input vars and we've exhausted the UBO size limits. + PL_ERR(dp, "Unable to add input variable: possibly exhausted " + "variable count / UBO size limits?"); + return false; +} + +#define ADD(b, ...) pl_str_builder_addf(b, __VA_ARGS__) +#define ADD_CAT(b, cat) pl_str_builder_concat(b, cat) +#define ADD_CONST(b, s) pl_str_builder_const_str(b, s) + +static void add_var(pl_str_builder body, const struct pl_var *var) +{ + const char *type = pl_var_glsl_type_name(*var); + if (var->dim_a > 1) { + ADD(body, "%s "$"[%d];\n", type, sh_ident_unpack(var->name), var->dim_a); + } else { + ADD(body, "%s "$";\n", type, sh_ident_unpack(var->name)); + } +} + +static int cmp_buffer_var(const void *pa, const void *pb) +{ + const struct pl_buffer_var * const *a = pa, * const *b = pb; + return PL_CMP((*a)->layout.offset, (*b)->layout.offset); +} + +static void add_buffer_vars(pl_dispatch dp, void *tmp, pl_str_builder body, + const struct pl_buffer_var *vars, int num) +{ + // Sort buffer vars by offset + PL_ARRAY_RESIZE(dp, dp->buf_tmp, num); + for (int i = 0; i < num; i++) + dp->buf_tmp.elem[i] = &vars[i]; + qsort(dp->buf_tmp.elem, num, sizeof(&vars[0]), cmp_buffer_var); + + ADD(body, "{\n"); + for (int i = 0; i < num; i++) { + const struct pl_buffer_var *bv = dp->buf_tmp.elem[i]; + // Add an explicit offset wherever possible + if (dp->gpu->glsl.version >= 440) + ADD(body, " layout(offset=%zu) ", bv->layout.offset); + add_var(body, &bv->var); + } + ADD(body, "};\n"); +} + +struct generate_params { + void *tmp; + pl_shader sh; + struct pass *pass; + struct pl_pass_params *pass_params; + ident_t out_mat; + ident_t out_off; + int vert_idx; +}; + +static void generate_shaders(pl_dispatch dp, + const struct generate_params *params, + pl_str_builder *out_vert_builder, + pl_str_builder *out_glsl_builder) +{ + pl_gpu gpu = dp->gpu; + pl_shader sh = params->sh; + void *tmp = params->tmp; + struct pass *pass = params->pass; + struct pl_pass_params *pass_params = params->pass_params; + pl_str_builder shader_body = sh_finalize_internal(sh); + + pl_str_builder pre = dp->tmp[TMP_PRELUDE]; + ADD(pre, "#version %d%s\n", gpu->glsl.version, + (gpu->glsl.gles && gpu->glsl.version > 100) ? 
" es" : ""); + if (pass_params->type == PL_PASS_COMPUTE) + ADD(pre, "#extension GL_ARB_compute_shader : enable\n"); + + // Enable this unconditionally if the GPU supports it, since we have no way + // of knowing whether subgroups are being used or not + if (gpu->glsl.subgroup_size) { + ADD(pre, "#extension GL_KHR_shader_subgroup_basic : enable \n" + "#extension GL_KHR_shader_subgroup_vote : enable \n" + "#extension GL_KHR_shader_subgroup_arithmetic : enable \n" + "#extension GL_KHR_shader_subgroup_ballot : enable \n" + "#extension GL_KHR_shader_subgroup_shuffle : enable \n" + "#extension GL_KHR_shader_subgroup_clustered : enable \n" + "#extension GL_KHR_shader_subgroup_quad : enable \n"); + } + + // Enable all extensions needed for different types of input + bool has_ssbo = false, has_ubo = false, has_img = false, has_texel = false, + has_ext = false, has_nofmt = false, has_gather = false; + for (int i = 0; i < sh->descs.num; i++) { + switch (sh->descs.elem[i].desc.type) { + case PL_DESC_BUF_UNIFORM: has_ubo = true; break; + case PL_DESC_BUF_STORAGE: has_ssbo = true; break; + case PL_DESC_BUF_TEXEL_UNIFORM: has_texel = true; break; + case PL_DESC_BUF_TEXEL_STORAGE: { + pl_buf buf = sh->descs.elem[i].binding.object; + has_nofmt |= !buf->params.format->glsl_format; + has_texel = true; + break; + } + case PL_DESC_STORAGE_IMG: { + pl_tex tex = sh->descs.elem[i].binding.object; + has_nofmt |= !tex->params.format->glsl_format; + has_img = true; + break; + } + case PL_DESC_SAMPLED_TEX: { + pl_tex tex = sh->descs.elem[i].binding.object; + has_gather |= tex->params.format->gatherable; + switch (tex->sampler_type) { + case PL_SAMPLER_NORMAL: break; + case PL_SAMPLER_RECT: break; + case PL_SAMPLER_EXTERNAL: has_ext = true; break; + case PL_SAMPLER_TYPE_COUNT: pl_unreachable(); + } + break; + } + + case PL_DESC_INVALID: + case PL_DESC_TYPE_COUNT: + pl_unreachable(); + } + } + + if (has_img) + ADD(pre, "#extension GL_ARB_shader_image_load_store : enable\n"); + if (has_ubo) + ADD(pre, "#extension GL_ARB_uniform_buffer_object : enable\n"); + if (has_ssbo) + ADD(pre, "#extension GL_ARB_shader_storage_buffer_object : enable\n"); + if (has_texel) + ADD(pre, "#extension GL_ARB_texture_buffer_object : enable\n"); + if (has_ext) { + if (gpu->glsl.version >= 300) { + ADD(pre, "#extension GL_OES_EGL_image_external_essl3 : enable\n"); + } else { + ADD(pre, "#extension GL_OES_EGL_image_external : enable\n"); + } + } + if (has_nofmt) + ADD(pre, "#extension GL_EXT_shader_image_load_formatted : enable\n"); + if (has_gather) + ADD(pre, "#extension GL_ARB_texture_gather : enable\n"); + + if (gpu->glsl.gles) { + // Use 32-bit precision for floats if possible + ADD(pre, "#ifdef GL_FRAGMENT_PRECISION_HIGH \n" + "precision highp float; \n" + "#else \n" + "precision mediump float; \n" + "#endif \n"); + + // Always use 16-bit precision for samplers + ADD(pre, "precision mediump sampler2D; \n"); + if (gpu->limits.max_tex_1d_dim) + ADD(pre, "precision mediump sampler1D; \n"); + if (gpu->limits.max_tex_3d_dim && gpu->glsl.version > 100) + ADD(pre, "precision mediump sampler3D; \n"); + + // Integer math has a good chance of caring about precision + ADD(pre, "precision highp int; \n"); + } + + // textureLod() doesn't work on external/rect samplers, simply disable + // LOD sampling in this case. We don't currently support mipmaps anyway. 
+ for (int i = 0; i < sh->descs.num; i++) { + if (pass_params->descriptors[i].type != PL_DESC_SAMPLED_TEX) + continue; + pl_tex tex = sh->descs.elem[i].binding.object; + if (tex->sampler_type != PL_SAMPLER_NORMAL) { + ADD(pre, "#define textureLod(t, p, b) texture(t, p) \n" + "#define textureLodOffset(t, p, b, o) \\\n" + " textureOffset(t, p, o) \n"); + break; + } + } + + // Add all of the push constants as their own element + if (pass_params->push_constants_size) { + // We re-use add_buffer_vars to make sure variables are sorted, this + // is important because the push constants can be out-of-order in + // `pass->vars` + PL_ARRAY(struct pl_buffer_var) pc_bvars = {0}; + for (int i = 0; i < sh->vars.num; i++) { + if (pass->vars[i].type != PASS_VAR_PUSHC) + continue; + + PL_ARRAY_APPEND(tmp, pc_bvars, (struct pl_buffer_var) { + .var = sh->vars.elem[i].var, + .layout = pass->vars[i].layout, + }); + } + + ADD(pre, "layout(std430, push_constant) uniform PushC "); + add_buffer_vars(dp, tmp, pre, pc_bvars.elem, pc_bvars.num); + } + + // Add all of the specialization constants + for (int i = 0; i < sh->consts.num; i++) { + static const char *types[PL_VAR_TYPE_COUNT] = { + [PL_VAR_SINT] = "int", + [PL_VAR_UINT] = "uint", + [PL_VAR_FLOAT] = "float", + }; + + const struct pl_shader_const *sc = &sh->consts.elem[i]; + ADD(pre, "layout(constant_id=%"PRIu32") const %s "$" = 1; \n", + pass_params->constants[i].id, types[sc->type], + sh_ident_unpack(sc->name)); + } + + static const char sampler_prefixes[PL_FMT_TYPE_COUNT] = { + [PL_FMT_FLOAT] = ' ', + [PL_FMT_UNORM] = ' ', + [PL_FMT_SNORM] = ' ', + [PL_FMT_UINT] = 'u', + [PL_FMT_SINT] = 'i', + }; + + // Add all of the required descriptors + for (int i = 0; i < sh->descs.num; i++) { + const struct pl_shader_desc *sd = &sh->descs.elem[i]; + const struct pl_desc *desc = &pass_params->descriptors[i]; + + switch (desc->type) { + case PL_DESC_SAMPLED_TEX: { + static const char *types[][4] = { + [PL_SAMPLER_NORMAL][1] = "sampler1D", + [PL_SAMPLER_NORMAL][2] = "sampler2D", + [PL_SAMPLER_NORMAL][3] = "sampler3D", + [PL_SAMPLER_RECT][2] = "sampler2DRect", + [PL_SAMPLER_EXTERNAL][2] = "samplerExternalOES", + }; + + pl_tex tex = sd->binding.object; + int dims = pl_tex_params_dimension(tex->params); + const char *type = types[tex->sampler_type][dims]; + char prefix = sampler_prefixes[tex->params.format->type]; + ident_t id = sh_ident_unpack(desc->name); + pl_assert(type && prefix); + + // Vulkan requires explicit bindings; GL always sets the + // bindings manually to avoid relying on the user doing so + if (gpu->glsl.vulkan) { + ADD(pre, "layout(binding=%d) uniform %c%s "$";\n", + desc->binding, prefix, type, id); + } else if (gpu->glsl.gles && prefix != ' ') { + ADD(pre, "uniform highp %c%s "$";\n", prefix, type, id); + } else { + ADD(pre, "uniform %c%s "$";\n", prefix, type, id); + } + break; + } + + case PL_DESC_STORAGE_IMG: { + static const char *types[] = { + [1] = "image1D", + [2] = "image2D", + [3] = "image3D", + }; + + // For better compatibility, we have to explicitly label the + // type of data we will be reading/writing to this image. 
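+ // As a rough illustration (binding slot, format and identifier are
+ // arbitrary examples, not taken from a real pass), a write-only rgba8
+ // 2D image on Vulkan ends up declared roughly as:
+ //     layout(binding=1, rgba8) writeonly restrict uniform image2D _img;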
+ pl_tex tex = sd->binding.object; + const char *format = tex->params.format->glsl_format; + int dims = pl_tex_params_dimension(tex->params); + if (gpu->glsl.vulkan) { + if (format) { + ADD(pre, "layout(binding=%d, %s) ", desc->binding, format); + } else { + ADD(pre, "layout(binding=%d) ", desc->binding); + } + } else if (format) { + ADD(pre, "layout(%s) ", format); + } + + ADD_CONST(pre, pl_desc_access_glsl_name(desc->access)); + if (sd->memory & PL_MEMORY_COHERENT) + ADD(pre, " coherent"); + if (sd->memory & PL_MEMORY_VOLATILE) + ADD(pre, " volatile"); + ADD(pre, " restrict uniform %s "$";\n", + types[dims], sh_ident_unpack(desc->name)); + break; + } + + case PL_DESC_BUF_UNIFORM: + if (gpu->glsl.vulkan) { + ADD(pre, "layout(std140, binding=%d) ", desc->binding); + } else { + ADD(pre, "layout(std140) "); + } + ADD(pre, "uniform "$" ", sh_ident_unpack(desc->name)); + add_buffer_vars(dp, tmp, pre, sd->buffer_vars, sd->num_buffer_vars); + break; + + case PL_DESC_BUF_STORAGE: + if (gpu->glsl.version >= 140) + ADD(pre, "layout(std430, binding=%d) ", desc->binding); + ADD_CONST(pre, pl_desc_access_glsl_name(desc->access)); + if (sd->memory & PL_MEMORY_COHERENT) + ADD(pre, " coherent"); + if (sd->memory & PL_MEMORY_VOLATILE) + ADD(pre, " volatile"); + ADD(pre, " restrict buffer "$" ", sh_ident_unpack(desc->name)); + add_buffer_vars(dp, tmp, pre, sd->buffer_vars, sd->num_buffer_vars); + break; + + case PL_DESC_BUF_TEXEL_UNIFORM: { + pl_buf buf = sd->binding.object; + char prefix = sampler_prefixes[buf->params.format->type]; + if (gpu->glsl.vulkan) + ADD(pre, "layout(binding=%d) ", desc->binding); + ADD(pre, "uniform %csamplerBuffer "$";\n", prefix, + sh_ident_unpack(desc->name)); + break; + } + + case PL_DESC_BUF_TEXEL_STORAGE: { + pl_buf buf = sd->binding.object; + const char *format = buf->params.format->glsl_format; + char prefix = sampler_prefixes[buf->params.format->type]; + if (gpu->glsl.vulkan) { + if (format) { + ADD(pre, "layout(binding=%d, %s) ", desc->binding, format); + } else { + ADD(pre, "layout(binding=%d) ", desc->binding); + } + } else if (format) { + ADD(pre, "layout(%s) ", format); + } + + ADD_CONST(pre, pl_desc_access_glsl_name(desc->access)); + if (sd->memory & PL_MEMORY_COHERENT) + ADD(pre, " coherent"); + if (sd->memory & PL_MEMORY_VOLATILE) + ADD(pre, " volatile"); + ADD(pre, " restrict uniform %cimageBuffer "$";\n", + prefix, sh_ident_unpack(desc->name)); + break; + } + + case PL_DESC_INVALID: + case PL_DESC_TYPE_COUNT: + pl_unreachable(); + } + } + + // Add all of the remaining variables + for (int i = 0; i < sh->vars.num; i++) { + const struct pl_var *var = &sh->vars.elem[i].var; + const struct pass_var *pv = &pass->vars[i]; + if (pv->type != PASS_VAR_GLOBAL) + continue; + ADD(pre, "uniform "); + add_var(pre, var); + } + + pl_str_builder glsl = dp->tmp[TMP_MAIN]; + ADD_CAT(glsl, pre); + + switch(pass_params->type) { + case PL_PASS_RASTER: { + pl_assert(params->vert_idx >= 0); + pl_str_builder vert_head = dp->tmp[TMP_VERT_HEAD]; + pl_str_builder vert_body = dp->tmp[TMP_VERT_BODY]; + + // Older GLSL doesn't support the use of explicit locations + bool has_loc = gpu->glsl.version >= 430; + + // Set up a trivial vertex shader + ADD_CAT(vert_head, pre); + ADD(vert_body, "void main() {\n"); + for (int i = 0; i < sh->vas.num; i++) { + const struct pl_vertex_attrib *va = &pass_params->vertex_attribs[i]; + const struct pl_shader_va *sva = &sh->vas.elem[i]; + const char *type = va->fmt->glsl_type; + + // Use the pl_shader_va for the name in the fragment shader since + // the 
pl_vertex_attrib is already mangled for the vertex shader + ident_t id = sh_ident_unpack(sva->attr.name); + + if (has_loc) { + ADD(vert_head, "layout(location=%d) in %s "$";\n", + va->location, type, sh_ident_unpack(va->name)); + } else { + ADD(vert_head, "in %s "$";\n", type, sh_ident_unpack(va->name)); + } + + if (i == params->vert_idx) { + pl_assert(va->fmt->num_components == 2); + ADD(vert_body, "vec2 va_pos = "$"; \n", sh_ident_unpack(va->name)); + if (params->out_mat) + ADD(vert_body, "va_pos = "$" * va_pos; \n", params->out_mat); + if (params->out_off) + ADD(vert_body, "va_pos += "$"; \n", params->out_off); + ADD(vert_body, "gl_Position = vec4(va_pos, 0.0, 1.0); \n"); + } else { + // Everything else is just blindly passed through + if (has_loc) { + ADD(vert_head, "layout(location=%d) out %s "$";\n", + va->location, type, id); + ADD(glsl, "layout(location=%d) in %s "$";\n", + va->location, type, id); + } else { + ADD(vert_head, "out %s "$";\n", type, id); + ADD(glsl, "in %s "$";\n", type, id); + } + ADD(vert_body, $" = "$";\n", id, sh_ident_unpack(va->name)); + } + } + + ADD(vert_body, "}"); + ADD_CAT(vert_head, vert_body); + pl_hash_merge(&pass->signature, pl_str_builder_hash(vert_head)); + *out_vert_builder = vert_head; + + if (has_loc) { + ADD(glsl, "layout(location=0) out vec4 out_color;\n"); + } else { + ADD(glsl, "out vec4 out_color;\n"); + } + break; + } + case PL_PASS_COMPUTE: + ADD(glsl, "layout (local_size_x = %d, local_size_y = %d) in;\n", + sh->group_size[0], sh->group_size[1]); + break; + case PL_PASS_INVALID: + case PL_PASS_TYPE_COUNT: + pl_unreachable(); + } + + // Set up the main shader body + ADD_CAT(glsl, shader_body); + ADD(glsl, "void main() {\n"); + + pl_assert(sh->input == PL_SHADER_SIG_NONE); + switch (pass_params->type) { + case PL_PASS_RASTER: + pl_assert(sh->output == PL_SHADER_SIG_COLOR); + ADD(glsl, "out_color = "$"();\n", sh->name); + break; + case PL_PASS_COMPUTE: + ADD(glsl, $"();\n", sh->name); + break; + case PL_PASS_INVALID: + case PL_PASS_TYPE_COUNT: + pl_unreachable(); + } + + ADD(glsl, "}"); + + pl_hash_merge(&pass->signature, pl_str_builder_hash(glsl)); + *out_glsl_builder = glsl; +} + +#undef ADD +#undef ADD_CAT + +#define pass_age(pass) (dp->current_index - (pass)->last_index) + +static int cmp_pass_age(const void *ptra, const void *ptrb) +{ + const struct pass *a = *(const struct pass **) ptra; + const struct pass *b = *(const struct pass **) ptrb; + return b->last_index - a->last_index; +} + +static void garbage_collect_passes(pl_dispatch dp) +{ + if (dp->passes.num <= dp->max_passes) + return; + + // Garbage collect oldest passes, starting at the middle + qsort(dp->passes.elem, dp->passes.num, sizeof(struct pass *), cmp_pass_age); + int idx = dp->passes.num / 2; + while (idx < dp->passes.num && pass_age(dp->passes.elem[idx]) < MIN_AGE) + idx++; + + for (int i = idx; i < dp->passes.num; i++) + pass_destroy(dp, dp->passes.elem[i]); + + int num_evicted = dp->passes.num - idx; + dp->passes.num = idx; + + if (num_evicted) { + PL_DEBUG(dp, "Evicted %d passes from dispatch cache, consider " + "using more dynamic shaders", num_evicted); + } else { + dp->max_passes *= 2; + } +} + +static struct pass *finalize_pass(pl_dispatch dp, pl_shader sh, + pl_tex target, int vert_idx, + const struct pl_blend_params *blend, bool load, + const struct pl_dispatch_vertex_params *vparams, + const pl_transform2x2 *proj) +{ + struct pass *pass = pl_alloc_ptr(dp, pass); + *pass = (struct pass) { + .signature = 0x0, // updated incrementally below + .last_index = 
dp->current_index, + .ubo_desc = { + .desc = { + .name = sh_ident_pack(sh_fresh(sh, "UBO")), + .type = PL_DESC_BUF_UNIFORM, + }, + }, + }; + + // For identifiers tied to the lifetime of this shader + void *tmp = sh->tmp; + + struct pl_pass_params params = { + .type = pl_shader_is_compute(sh) ? PL_PASS_COMPUTE : PL_PASS_RASTER, + .num_descriptors = sh->descs.num, + .vertex_type = vparams ? vparams->vertex_type : PL_PRIM_TRIANGLE_STRIP, + .vertex_stride = vparams ? vparams->vertex_stride : 0, + .blend_params = blend, + }; + + struct generate_params gen_params = { + .tmp = tmp, + .pass = pass, + .pass_params = ¶ms, + .sh = sh, + .vert_idx = vert_idx, + }; + + if (params.type == PL_PASS_RASTER) { + assert(target); + params.target_format = target->params.format; + params.load_target = load; + + // Fill in the vertex attributes array + params.num_vertex_attribs = sh->vas.num; + params.vertex_attribs = pl_calloc_ptr(tmp, sh->vas.num, params.vertex_attribs); + + int va_loc = 0; + for (int i = 0; i < sh->vas.num; i++) { + struct pl_vertex_attrib *va = ¶ms.vertex_attribs[i]; + *va = sh->vas.elem[i].attr; + + // Mangle the name to make sure it doesn't conflict with the + // fragment shader input, this will be converted back to a legal + // string by the shader compilation code + va->name = sh_ident_pack(sh_fresh(sh, "va")); + + // Place the vertex attribute + va->location = va_loc; + if (!vparams) { + va->offset = params.vertex_stride; + params.vertex_stride += va->fmt->texel_size; + } + + // The number of vertex attribute locations consumed by a vertex + // attribute is the number of vec4s it consumes, rounded up + const size_t va_loc_size = sizeof(float[4]); + va_loc += PL_DIV_UP(va->fmt->texel_size, va_loc_size); + } + + // Hash in the raster state configuration + pl_hash_merge(&pass->signature, (uint64_t) params.vertex_type); + pl_hash_merge(&pass->signature, (uint64_t) params.vertex_stride); + pl_hash_merge(&pass->signature, (uint64_t) params.load_target); + pl_hash_merge(&pass->signature, target->params.format->signature); + if (blend) { + pl_static_assert(sizeof(*blend) == sizeof(enum pl_blend_mode) * 4); + pl_hash_merge(&pass->signature, pl_var_hash(*blend)); + } + + // Load projection matrix if required + if (proj && memcmp(&proj->mat, &pl_matrix2x2_identity, sizeof(proj->mat)) != 0) { + gen_params.out_mat = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_mat2("proj"), + .data = PL_TRANSPOSE_2X2(proj->mat.m), + }); + } + + if (proj && (proj->c[0] || proj->c[1])) { + gen_params.out_off = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_vec2("offset"), + .data = proj->c, + }); + } + } + + // Place all of the compile-time constants + uint8_t *constant_data = NULL; + if (sh->consts.num) { + params.num_constants = sh->consts.num; + params.constants = pl_alloc(tmp, sh->consts.num * sizeof(struct pl_constant)); + + // Compute offsets + size_t total_size = 0; + uint32_t const_id = 0; + for (int i = 0; i < sh->consts.num; i++) { + params.constants[i] = (struct pl_constant) { + .type = sh->consts.elem[i].type, + .id = const_id++, + .offset = total_size, + }; + total_size += pl_var_type_size(sh->consts.elem[i].type); + } + + // Write values into the constants buffer + params.constant_data = constant_data = pl_alloc(pass, total_size); + for (int i = 0; i < sh->consts.num; i++) { + const struct pl_shader_const *sc = &sh->consts.elem[i]; + void *data = constant_data + params.constants[i].offset; + memcpy(data, sc->data, pl_var_type_size(sc->type)); + } + } + + // Place all the variables; these will 
dynamically end up in different + // locations based on what the underlying GPU supports (UBOs, pushc, etc.) + // + // We go through the list twice, once to place stuff that we definitely + // want inside PCs, and then a second time to opportunistically place the rest. + pass->vars = pl_calloc_ptr(pass, sh->vars.num, pass->vars); + for (int i = 0; i < sh->vars.num; i++) { + if (!add_pass_var(dp, tmp, pass, ¶ms, &sh->vars.elem[i], &pass->vars[i], false)) + goto error; + } + for (int i = 0; i < sh->vars.num; i++) { + if (!add_pass_var(dp, tmp, pass, ¶ms, &sh->vars.elem[i], &pass->vars[i], true)) + goto error; + } + + // Now that we know the variable placement, finalize pushc/UBO sizes + params.push_constants_size = PL_ALIGN2(params.push_constants_size, 4); + size_t ubo_size = sh_buf_desc_size(&pass->ubo_desc); + if (ubo_size) { + pass->ubo_index = sh->descs.num; + PL_ARRAY_APPEND(sh, sh->descs, pass->ubo_desc); // don't mangle names + }; + + // Place and fill in the descriptors + const int num_descs = sh->descs.num; + int binding[PL_DESC_TYPE_COUNT] = {0}; + params.num_descriptors = num_descs; + params.descriptors = pl_calloc_ptr(tmp, num_descs, params.descriptors); + for (int i = 0; i < num_descs; i++) { + struct pl_desc *desc = ¶ms.descriptors[i]; + *desc = sh->descs.elem[i].desc; + desc->binding = binding[pl_desc_namespace(dp->gpu, desc->type)]++; + } + + // Finalize the shader and look it up in the pass cache + pl_str_builder vert_builder = NULL, glsl_builder = NULL; + generate_shaders(dp, &gen_params, &vert_builder, &glsl_builder); + for (int i = 0; i < dp->passes.num; i++) { + struct pass *p = dp->passes.elem[i]; + if (p->signature != pass->signature) + continue; + + // Found existing shader, re-use directly + if (p->ubo) + sh->descs.elem[p->ubo_index].binding.object = p->ubo; + pl_free(p->run_params.constant_data); + p->run_params.constant_data = pl_steal(p, constant_data); + p->last_index = dp->current_index; + pl_free(pass); + return p; + } + + // Need to compile new shader, execute templates now + if (vert_builder) { + pl_str vert = pl_str_builder_exec(vert_builder); + params.vertex_shader = (char *) vert.buf; + } + pl_str glsl = pl_str_builder_exec(glsl_builder); + params.glsl_shader = (char *) glsl.buf; + + // Turn all shader identifiers into actual strings before passing it + // to the `pl_gpu` +#define FIX_IDENT(name) \ + name = sh_ident_tostr(sh_ident_unpack(name)) + for (int i = 0; i < params.num_variables; i++) + FIX_IDENT(params.variables[i].name); + for (int i = 0; i < params.num_descriptors; i++) + FIX_IDENT(params.descriptors[i].name); + for (int i = 0; i < params.num_vertex_attribs; i++) + FIX_IDENT(params.vertex_attribs[i].name); +#undef FIX_IDENT + + pass->pass = pl_pass_create(dp->gpu, ¶ms); + if (!pass->pass) { + PL_ERR(dp, "Failed creating render pass for dispatch"); + // Add it anyway + } + + struct pl_pass_run_params *rparams = &pass->run_params; + rparams->pass = pass->pass; + rparams->constant_data = constant_data; + rparams->push_constants = pl_zalloc(pass, params.push_constants_size); + rparams->desc_bindings = pl_calloc_ptr(pass, params.num_descriptors, + rparams->desc_bindings); + + if (ubo_size && pass->pass) { + // Create the UBO + pass->ubo = pl_buf_create(dp->gpu, pl_buf_params( + .size = ubo_size, + .uniform = true, + .host_writable = true, + )); + + if (!pass->ubo) { + PL_ERR(dp, "Failed creating uniform buffer for dispatch"); + goto error; + } + + sh->descs.elem[pass->ubo_index].binding.object = pass->ubo; + } + + if (params.type == PL_PASS_RASTER && 
!vparams) { + // Generate the vertex array placeholder + rparams->vertex_count = 4; // single quad + size_t vert_size = rparams->vertex_count * params.vertex_stride; + rparams->vertex_data = pl_zalloc(pass, vert_size); + } + + pass->timer = pl_timer_create(dp->gpu); + + PL_ARRAY_APPEND(dp, dp->passes, pass); + return pass; + +error: + pass_destroy(dp, pass); + return NULL; +} + +static void update_pass_var(pl_dispatch dp, struct pass *pass, + const struct pl_shader_var *sv, struct pass_var *pv) +{ + struct pl_var_layout host_layout = pl_var_host_layout(0, &sv->var); + pl_assert(host_layout.size); + + // Use the cache to skip updates if possible + if (pv->cached_data && !memcmp(sv->data, pv->cached_data, host_layout.size)) + return; + if (!pv->cached_data) + pv->cached_data = pl_alloc(pass, host_layout.size); + memcpy(pv->cached_data, sv->data, host_layout.size); + + struct pl_pass_run_params *rparams = &pass->run_params; + switch (pv->type) { + case PASS_VAR_NONE: + pl_unreachable(); + case PASS_VAR_GLOBAL: { + struct pl_var_update vu = { + .index = pv->index, + .data = sv->data, + }; + PL_ARRAY_APPEND_RAW(pass, rparams->var_updates, rparams->num_var_updates, vu); + break; + } + case PASS_VAR_UBO: { + pl_assert(pass->ubo); + const size_t offset = pv->layout.offset; + if (host_layout.stride == pv->layout.stride) { + pl_assert(host_layout.size == pv->layout.size); + pl_buf_write(dp->gpu, pass->ubo, offset, sv->data, host_layout.size); + } else { + // Coalesce strided UBO write into a single pl_buf_write to avoid + // unnecessary synchronization overhead by assembling the correctly + // strided upload in RAM + pl_grow(dp, &dp->ubo_tmp, pv->layout.size); + uint8_t * const tmp = dp->ubo_tmp; + const uint8_t *src = sv->data; + const uint8_t *end = src + host_layout.size; + uint8_t *dst = tmp; + while (src < end) { + memcpy(dst, src, host_layout.stride); + src += host_layout.stride; + dst += pv->layout.stride; + } + pl_buf_write(dp->gpu, pass->ubo, offset, tmp, pv->layout.size); + } + break; + } + case PASS_VAR_PUSHC: + pl_assert(rparams->push_constants); + memcpy_layout(rparams->push_constants, pv->layout, sv->data, host_layout); + break; + }; +} + +static void compute_vertex_attribs(pl_dispatch dp, pl_shader sh, + int width, int height, ident_t *out_scale) +{ + // Simulate vertex attributes using global definitions + *out_scale = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_vec2("out_scale"), + .data = &(float[2]){ 1.0 / width, 1.0 / height }, + .dynamic = true, + }); + + GLSLP("#define frag_pos(id) (vec2(id) + vec2(0.5)) \n" + "#define frag_map(id) ("$" * frag_pos(id)) \n" + "#define gl_FragCoord vec4(frag_pos(gl_GlobalInvocationID), 0.0, 1.0) \n", + *out_scale); + + for (int n = 0; n < sh->vas.num; n++) { + const struct pl_shader_va *sva = &sh->vas.elem[n]; + + ident_t points[4]; + for (int i = 0; i < PL_ARRAY_SIZE(points); i++) { + points[i] = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_from_fmt(sva->attr.fmt, "pt"), + .data = sva->data[i], + }); + } + + GLSLP("#define "$"_map(id) " + "(mix(mix("$", "$", frag_map(id).x), " + " mix("$", "$", frag_map(id).x), " + "frag_map(id).y)) \n" + "#define "$" ("$"_map(gl_GlobalInvocationID)) \n", + sh_ident_unpack(sva->attr.name), + points[0], points[1], points[2], points[3], + sh_ident_unpack(sva->attr.name), + sh_ident_unpack(sva->attr.name)); + } +} + +static void translate_compute_shader(pl_dispatch dp, pl_shader sh, + const pl_rect2d *rc, + const struct pl_dispatch_params *params) +{ + int width = abs(pl_rect_w(*rc)), height = 
abs(pl_rect_h(*rc)); + if (sh->transpose) + PL_SWAP(width, height); + ident_t out_scale; + compute_vertex_attribs(dp, sh, width, height, &out_scale); + + // Simulate a framebuffer using storage images + pl_assert(params->target->params.storable); + pl_assert(sh->output == PL_SHADER_SIG_COLOR); + ident_t fbo = sh_desc(sh, (struct pl_shader_desc) { + .binding.object = params->target, + .desc = { + .name = "out_image", + .type = PL_DESC_STORAGE_IMG, + .access = params->blend_params ? PL_DESC_ACCESS_READWRITE + : PL_DESC_ACCESS_WRITEONLY, + }, + }); + + ident_t base = sh_var(sh, (struct pl_shader_var) { + .data = &(int[2]){ rc->x0, rc->y0 }, + .dynamic = true, + .var = { + .name = "base", + .type = PL_VAR_SINT, + .dim_v = 2, + .dim_m = 1, + .dim_a = 1, + }, + }); + + int dx = rc->x0 > rc->x1 ? -1 : 1, dy = rc->y0 > rc->y1 ? -1 : 1; + GLSL("ivec2 dir = ivec2(%d, %d);\n", dx, dy); // hard-code, not worth var + GLSL("ivec2 pos = "$" + dir * ivec2(gl_GlobalInvocationID).%c%c;\n", + base, sh->transpose ? 'y' : 'x', sh->transpose ? 'x' : 'y'); + GLSL("vec2 fpos = "$" * vec2(gl_GlobalInvocationID);\n", out_scale); + GLSL("if (fpos.x < 1.0 && fpos.y < 1.0) {\n"); + if (params->blend_params) { + GLSL("vec4 orig = imageLoad("$", pos);\n", fbo); + + static const char *modes[] = { + [PL_BLEND_ZERO] = "0.0", + [PL_BLEND_ONE] = "1.0", + [PL_BLEND_SRC_ALPHA] = "color.a", + [PL_BLEND_ONE_MINUS_SRC_ALPHA] = "(1.0 - color.a)", + }; + + GLSL("color = vec4(color.rgb * vec3(%s), color.a * %s) \n" + " + vec4(orig.rgb * vec3(%s), orig.a * %s);\n", + modes[params->blend_params->src_rgb], + modes[params->blend_params->src_alpha], + modes[params->blend_params->dst_rgb], + modes[params->blend_params->dst_alpha]); + } + GLSL("imageStore("$", pos, color);\n", fbo); + GLSL("}\n"); + sh->output = PL_SHADER_SIG_NONE; +} + +static void run_pass(pl_dispatch dp, pl_shader sh, struct pass *pass) +{ + pl_shader_info shader = &sh->info->info; + pl_pass_run(dp->gpu, &pass->run_params); + + for (uint64_t ts; (ts = pl_timer_query(dp->gpu, pass->timer));) { + PL_TRACE(dp, "Spent %.3f ms on shader: %s", ts / 1e6, shader->description); + + uint64_t old = pass->samples[pass->ts_idx]; + pass->samples[pass->ts_idx] = ts; + pass->ts_last = ts; + pass->ts_peak = PL_MAX(pass->ts_peak, ts); + pass->ts_sum += ts; + pass->ts_idx = (pass->ts_idx + 1) % PL_ARRAY_SIZE(pass->samples); + + if (old) { + pass->ts_sum -= old; + if (old == pass->ts_peak) { + uint64_t new_peak = 0; + for (int i = 0; i < PL_ARRAY_SIZE(pass->samples); i++) + new_peak = PL_MAX(new_peak, pass->samples[i]); + pass->ts_peak = new_peak; + } + } + } + + if (!dp->info_callback) + return; + + struct pl_dispatch_info info; + info.signature = pass->signature; + info.shader = shader; + + // Test to see if the ring buffer already wrapped around once + if (pass->samples[pass->ts_idx]) { + info.num_samples = PL_ARRAY_SIZE(pass->samples); + int num_wrapped = info.num_samples - pass->ts_idx; + memcpy(info.samples, &pass->samples[pass->ts_idx], + num_wrapped * sizeof(info.samples[0])); + memcpy(&info.samples[num_wrapped], pass->samples, + pass->ts_idx * sizeof(info.samples[0])); + } else { + info.num_samples = pass->ts_idx; + memcpy(info.samples, pass->samples, + pass->ts_idx * sizeof(info.samples[0])); + } + + info.last = pass->ts_last; + info.peak = pass->ts_peak; + info.average = pass->ts_sum / PL_MAX(info.num_samples, 1); + dp->info_callback(dp->info_priv, &info); +} + +bool pl_dispatch_finish(pl_dispatch dp, const struct pl_dispatch_params *params) +{ + pl_shader sh = *params->shader; 
+    bool ret = false;
+    pl_mutex_lock(&dp->lock);
+
+    if (sh->failed) {
+        PL_ERR(sh, "Trying to dispatch a failed shader.");
+        goto error;
+    }
+
+    if (!sh->mutable) {
+        PL_ERR(dp, "Trying to dispatch non-mutable shader?");
+        goto error;
+    }
+
+    if (sh->input != PL_SHADER_SIG_NONE || sh->output != PL_SHADER_SIG_COLOR) {
+        PL_ERR(dp, "Trying to dispatch shader with incompatible signature!");
+        goto error;
+    }
+
+    const struct pl_tex_params *tpars = &params->target->params;
+    if (pl_tex_params_dimension(*tpars) != 2 || !tpars->renderable) {
+        PL_ERR(dp, "Trying to dispatch a shader using an invalid target "
+               "texture. The target must be a renderable 2D texture.");
+        goto error;
+    }
+
+    const struct pl_gpu_limits *limits = &dp->gpu->limits;
+    bool can_compute = tpars->storable;
+    if (can_compute && params->blend_params)
+        can_compute = tpars->format->caps & PL_FMT_CAP_READWRITE;
+
+    if (pl_shader_is_compute(sh) && !can_compute) {
+        PL_ERR(dp, "Trying to dispatch using a compute shader with a "
+               "non-storable or incompatible target texture.");
+        goto error;
+    } else if (can_compute && limits->compute_queues > limits->fragment_queues) {
+        if (sh_try_compute(sh, 16, 16, true, 0))
+            PL_TRACE(dp, "Upgrading fragment shader to compute shader.");
+    }
+
+    pl_rect2d rc = params->rect;
+    if (!pl_rect_w(rc)) {
+        rc.x0 = 0;
+        rc.x1 = tpars->w;
+    }
+    if (!pl_rect_h(rc)) {
+        rc.y0 = 0;
+        rc.y1 = tpars->h;
+    }
+
+    int w, h, tw = abs(pl_rect_w(rc)), th = abs(pl_rect_h(rc));
+    if (pl_shader_output_size(sh, &w, &h) && (w != tw || h != th))
+    {
+        PL_ERR(dp, "Trying to dispatch a shader with explicit output size "
+               "requirements %dx%d%s using a target rect of size %dx%d.",
+               w, h, sh->transpose ? " (transposed)" : "", tw, th);
+        goto error;
+    }
+
+    int vert_idx = -1;
+    const pl_transform2x2 *proj = NULL;
+    if (pl_shader_is_compute(sh)) {
+        // Translate the compute shader to simulate vertices etc.
+ translate_compute_shader(dp, sh, &rc, params); + } else { + // Add the vertex information encoding the position + pl_rect2df vert_rect = { + .x0 = 2.0 * rc.x0 / tpars->w - 1.0, + .y0 = 2.0 * rc.y0 / tpars->h - 1.0, + .x1 = 2.0 * rc.x1 / tpars->w - 1.0, + .y1 = 2.0 * rc.y1 / tpars->h - 1.0, + }; + + if (sh->transpose) { + static const pl_transform2x2 transpose_proj = {{{ + { 0, 1 }, + { 1, 0 }, + }}}; + proj = &transpose_proj; + PL_SWAP(vert_rect.x0, vert_rect.y0); + PL_SWAP(vert_rect.x1, vert_rect.y1); + } + + sh_attr_vec2(sh, "position", &vert_rect); + vert_idx = sh->vas.num - 1; + } + + // We need to set pl_pass_params.load_target when either blending is + // enabled or we're drawing to some scissored sub-rect of the texture + pl_rect2d full = { 0, 0, tpars->w, tpars->h }; + pl_rect2d rc_norm = rc; + pl_rect2d_normalize(&rc_norm); + rc_norm.x0 = PL_MAX(rc_norm.x0, 0); + rc_norm.y0 = PL_MAX(rc_norm.y0, 0); + rc_norm.x1 = PL_MIN(rc_norm.x1, tpars->w); + rc_norm.y1 = PL_MIN(rc_norm.y1, tpars->h); + bool load = params->blend_params || !pl_rect2d_eq(rc_norm, full); + + struct pass *pass = finalize_pass(dp, sh, params->target, vert_idx, + params->blend_params, load, NULL, proj); + + // Silently return on failed passes + if (!pass || !pass->pass) + goto error; + + struct pl_pass_run_params *rparams = &pass->run_params; + + // Update the descriptor bindings + for (int i = 0; i < sh->descs.num; i++) + rparams->desc_bindings[i] = sh->descs.elem[i].binding; + + // Update all of the variables (if needed) + rparams->num_var_updates = 0; + for (int i = 0; i < sh->vars.num; i++) + update_pass_var(dp, pass, &sh->vars.elem[i], &pass->vars[i]); + + // Update the vertex data + if (rparams->vertex_data) { + uintptr_t vert_base = (uintptr_t) rparams->vertex_data; + size_t stride = rparams->pass->params.vertex_stride; + for (int i = 0; i < sh->vas.num; i++) { + const struct pl_shader_va *sva = &sh->vas.elem[i]; + struct pl_vertex_attrib *va = &rparams->pass->params.vertex_attribs[i]; + + size_t size = sva->attr.fmt->texel_size; + uintptr_t va_base = vert_base + va->offset; // use placed offset + for (int n = 0; n < 4; n++) + memcpy((void *) (va_base + n * stride), sva->data[n], size); + } + } + + // For compute shaders: also update the dispatch dimensions + if (pl_shader_is_compute(sh)) { + int width = abs(pl_rect_w(rc)), + height = abs(pl_rect_h(rc)); + if (sh->transpose) + PL_SWAP(width, height); + // Round up to make sure we don't leave off a part of the target + int block_w = sh->group_size[0], + block_h = sh->group_size[1], + num_x = PL_DIV_UP(width, block_w), + num_y = PL_DIV_UP(height, block_h); + + rparams->compute_groups[0] = num_x; + rparams->compute_groups[1] = num_y; + rparams->compute_groups[2] = 1; + } else { + // Update the scissors for performance + rparams->scissors = rc_norm; + } + + // Dispatch the actual shader + rparams->target = params->target; + rparams->timer = PL_DEF(params->timer, pass->timer); + run_pass(dp, sh, pass); + + ret = true; + // fall through + +error: + // Reset the temporary buffers which we use to build the shader + for (int i = 0; i < PL_ARRAY_SIZE(dp->tmp); i++) + pl_str_builder_reset(dp->tmp[i]); + + pl_mutex_unlock(&dp->lock); + pl_dispatch_abort(dp, params->shader); + return ret; +} + +bool pl_dispatch_compute(pl_dispatch dp, const struct pl_dispatch_compute_params *params) +{ + pl_shader sh = *params->shader; + bool ret = false; + pl_mutex_lock(&dp->lock); + + if (sh->failed) { + PL_ERR(sh, "Trying to dispatch a failed shader."); + goto error; + } + + if 
(!sh->mutable) {
+        PL_ERR(dp, "Trying to dispatch non-mutable shader?");
+        goto error;
+    }
+
+    if (sh->input != PL_SHADER_SIG_NONE) {
+        PL_ERR(dp, "Trying to dispatch shader with incompatible signature!");
+        goto error;
+    }
+
+    if (!pl_shader_is_compute(sh)) {
+        PL_ERR(dp, "Trying to dispatch a non-compute shader using "
+               "`pl_dispatch_compute`!");
+        goto error;
+    }
+
+    if (sh->vas.num) {
+        if (!params->width || !params->height) {
+            PL_ERR(dp, "Trying to dispatch a targetless compute shader that "
+                   "uses vertex attributes, this requires specifying the size "
+                   "of the effective rendering area!");
+            goto error;
+        }
+
+        compute_vertex_attribs(dp, sh, params->width, params->height,
+                               &(ident_t){0});
+    }
+
+    struct pass *pass = finalize_pass(dp, sh, NULL, -1, NULL, false, NULL, NULL);
+
+    // Silently return on failed passes
+    if (!pass || !pass->pass)
+        goto error;
+
+    struct pl_pass_run_params *rparams = &pass->run_params;
+
+    // Update the descriptor bindings
+    for (int i = 0; i < sh->descs.num; i++)
+        rparams->desc_bindings[i] = sh->descs.elem[i].binding;
+
+    // Update all of the variables (if needed)
+    rparams->num_var_updates = 0;
+    for (int i = 0; i < sh->vars.num; i++)
+        update_pass_var(dp, pass, &sh->vars.elem[i], &pass->vars[i]);
+
+    // Update the dispatch size
+    int groups = 1;
+    for (int i = 0; i < 3; i++) {
+        groups *= params->dispatch_size[i];
+        rparams->compute_groups[i] = params->dispatch_size[i];
+    }
+
+    if (!groups) {
+        pl_assert(params->width && params->height);
+        int block_w = sh->group_size[0],
+            block_h = sh->group_size[1],
+            num_x = PL_DIV_UP(params->width, block_w),
+            num_y = PL_DIV_UP(params->height, block_h);
+
+        rparams->compute_groups[0] = num_x;
+        rparams->compute_groups[1] = num_y;
+        rparams->compute_groups[2] = 1;
+    }
+
+    // Dispatch the actual shader
+    rparams->timer = PL_DEF(params->timer, pass->timer);
+    run_pass(dp, sh, pass);
+
+    ret = true;
+    // fall through
+
+error:
+    // Reset the temporary buffers which we use to build the shader
+    for (int i = 0; i < PL_ARRAY_SIZE(dp->tmp); i++)
+        pl_str_builder_reset(dp->tmp[i]);
+
+    pl_mutex_unlock(&dp->lock);
+    pl_dispatch_abort(dp, params->shader);
+    return ret;
+}
+
+bool pl_dispatch_vertex(pl_dispatch dp, const struct pl_dispatch_vertex_params *params)
+{
+    pl_shader sh = *params->shader;
+    bool ret = false;
+    pl_mutex_lock(&dp->lock);
+
+    if (sh->failed) {
+        PL_ERR(sh, "Trying to dispatch a failed shader.");
+        goto error;
+    }
+
+    if (!sh->mutable) {
+        PL_ERR(dp, "Trying to dispatch non-mutable shader?");
+        goto error;
+    }
+
+    if (sh->input != PL_SHADER_SIG_NONE || sh->output != PL_SHADER_SIG_COLOR) {
+        PL_ERR(dp, "Trying to dispatch shader with incompatible signature!");
+        goto error;
+    }
+
+    const struct pl_tex_params *tpars = &params->target->params;
+    if (pl_tex_params_dimension(*tpars) != 2 || !tpars->renderable) {
+        PL_ERR(dp, "Trying to dispatch a shader using an invalid target "
+               "texture. The target must be a renderable 2D texture.");
+        goto error;
+    }
+
+    if (pl_shader_is_compute(sh)) {
+        PL_ERR(dp, "Trying to dispatch a compute shader using pl_dispatch_vertex.");
+        goto error;
+    }
+
+    if (sh->vas.num) {
+        PL_ERR(dp, "Trying to dispatch a custom vertex shader with already "
+               "attached vertex attributes.");
+        goto error;
+    }
+
+    if (sh->transpose) {
+        PL_ERR(dp, "Trying to dispatch a transposed shader using "
+               "pl_dispatch_vertex, unlikely to be correct.
Erroring as a " + "safety precaution!"); + goto error; + } + + int pos_idx = params->vertex_position_idx; + if (pos_idx < 0 || pos_idx >= params->num_vertex_attribs) { + PL_ERR(dp, "Vertex position index out of range?"); + goto error; + } + + // Attach all of the vertex attributes to the shader manually + sh->vas.num = params->num_vertex_attribs; + PL_ARRAY_RESIZE(sh, sh->vas, sh->vas.num); + for (int i = 0; i < params->num_vertex_attribs; i++) { + ident_t id = sh_fresh(sh, params->vertex_attribs[i].name); + sh->vas.elem[i].attr = params->vertex_attribs[i]; + sh->vas.elem[i].attr.name = sh_ident_pack(id); + GLSLP("#define %s "$"\n", params->vertex_attribs[i].name, id); + } + + // Compute the coordinate projection matrix + pl_transform2x2 proj = pl_transform2x2_identity; + switch (params->vertex_coords) { + case PL_COORDS_ABSOLUTE: + proj.mat.m[0][0] /= tpars->w; + proj.mat.m[1][1] /= tpars->h; + // fall through + case PL_COORDS_RELATIVE: + proj.mat.m[0][0] *= 2.0; + proj.mat.m[1][1] *= 2.0; + proj.c[0] -= 1.0; + proj.c[1] -= 1.0; + // fall through + case PL_COORDS_NORMALIZED: + if (params->vertex_flipped) { + proj.mat.m[1][1] = -proj.mat.m[1][1]; + proj.c[1] += 2.0; + } + break; + } + + struct pass *pass = finalize_pass(dp, sh, params->target, pos_idx, + params->blend_params, true, params, &proj); + + // Silently return on failed passes + if (!pass || !pass->pass) + goto error; + + struct pl_pass_run_params *rparams = &pass->run_params; + + // Update the descriptor bindings + for (int i = 0; i < sh->descs.num; i++) + rparams->desc_bindings[i] = sh->descs.elem[i].binding; + + // Update all of the variables (if needed) + rparams->num_var_updates = 0; + for (int i = 0; i < sh->vars.num; i++) + update_pass_var(dp, pass, &sh->vars.elem[i], &pass->vars[i]); + + // Update the scissors + rparams->scissors = params->scissors; + if (params->vertex_flipped) { + rparams->scissors.y0 = tpars->h - rparams->scissors.y0; + rparams->scissors.y1 = tpars->h - rparams->scissors.y1; + } + pl_rect2d_normalize(&rparams->scissors); + + // Dispatch the actual shader + rparams->target = params->target; + rparams->vertex_count = params->vertex_count; + rparams->vertex_data = params->vertex_data; + rparams->vertex_buf = params->vertex_buf; + rparams->buf_offset = params->buf_offset; + rparams->index_data = params->index_data; + rparams->index_fmt = params->index_fmt; + rparams->index_buf = params->index_buf; + rparams->index_offset = params->index_offset; + rparams->timer = PL_DEF(params->timer, pass->timer); + run_pass(dp, sh, pass); + + ret = true; + // fall through + +error: + // Reset the temporary buffers which we use to build the shader + for (int i = 0; i < PL_ARRAY_SIZE(dp->tmp); i++) + pl_str_builder_reset(dp->tmp[i]); + + pl_mutex_unlock(&dp->lock); + pl_dispatch_abort(dp, params->shader); + return ret; +} + +void pl_dispatch_abort(pl_dispatch dp, pl_shader *psh) +{ + pl_shader sh = *psh; + if (!sh) + return; + + // Free unused memory as early as possible + sh_deref(sh); + + // Re-add the shader to the internal pool of shaders + pl_mutex_lock(&dp->lock); + PL_ARRAY_APPEND(dp, dp->shaders, sh); + pl_mutex_unlock(&dp->lock); + *psh = NULL; +} + +void pl_dispatch_reset_frame(pl_dispatch dp) +{ + pl_mutex_lock(&dp->lock); + + dp->current_ident = 0; + dp->current_index++; + garbage_collect_passes(dp); + + pl_mutex_unlock(&dp->lock); +} + +size_t pl_dispatch_save(pl_dispatch dp, uint8_t *out) +{ + return pl_cache_save(pl_gpu_cache(dp->gpu), out, out ? 
SIZE_MAX : 0); +} + +void pl_dispatch_load(pl_dispatch dp, const uint8_t *cache) +{ + pl_cache_load(pl_gpu_cache(dp->gpu), cache, SIZE_MAX); +} diff --git a/src/dispatch.h b/src/dispatch.h new file mode 100644 index 0000000..66c10f6 --- /dev/null +++ b/src/dispatch.h @@ -0,0 +1,31 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include "common.h" + +// Like `pl_dispatch_begin`, but has an extra `unique` parameter. If this is +// true, the generated shader will be uniquely namespaced `unique` and may be +// freely merged with other shaders (`sh_subpass`). Otherwise, all shaders have +// the same namespace and merging them is an error. +pl_shader pl_dispatch_begin_ex(pl_dispatch dp, bool unique); + +// Set the `dynamic_constants` field for newly created `pl_shader` objects. +// +// This is a private API because it's sort of clunky/stateful. +void pl_dispatch_mark_dynamic(pl_dispatch dp, bool dynamic); diff --git a/src/dither.c b/src/dither.c new file mode 100644 index 0000000..13f68e4 --- /dev/null +++ b/src/dither.c @@ -0,0 +1,317 @@ +/* + * Generate a noise texture for dithering images. + * Copyright © 2013 Wessel Dankers <wsl@fruit.je> + * + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + * + * The original code is taken from mpv, under the same license. 
+ */ + + +#include <stdint.h> +#include <stdbool.h> +#include <stdlib.h> +#include <inttypes.h> +#include <string.h> +#include <assert.h> +#include <math.h> + +#include "common.h" + +#include <libplacebo/dither.h> + +void pl_generate_bayer_matrix(float *data, int size) +{ + pl_assert(size >= 0); + + // Start with a single entry of 0 + data[0] = 0; + + for (int sz = 1; sz < size; sz *= 2) { + // Make three copies of the current, appropriately shifted and scaled + for (int y = 0; y < sz; y ++) { + for (int x = 0; x < sz; x++) { + int offsets[] = {0, sz * size + sz, sz, sz * size}; + int pos = y * size + x; + + for (int i = 1; i < 4; i++) + data[pos + offsets[i]] = data[pos] + i / (4.0 * sz * sz); + } + } + } +} + +#define MAX_SIZEB 8 +#define MAX_SIZE (1 << MAX_SIZEB) +#define MAX_SIZE2 (MAX_SIZE * MAX_SIZE) + +typedef uint_fast32_t index_t; + +#define WRAP_SIZE2(k, x) ((index_t)((index_t)(x) & ((k)->size2 - 1))) +#define XY(k, x, y) ((index_t)(((x) | ((y) << (k)->sizeb)))) + +struct ctx { + unsigned int sizeb, size, size2; + unsigned int gauss_radius; + unsigned int gauss_middle; + uint64_t gauss[MAX_SIZE2]; + index_t randomat[MAX_SIZE2]; + bool calcmat[MAX_SIZE2]; + uint64_t gaussmat[MAX_SIZE2]; + index_t unimat[MAX_SIZE2]; +}; + +static void makegauss(struct ctx *k, unsigned int sizeb) +{ + pl_assert(sizeb >= 1 && sizeb <= MAX_SIZEB); + + k->sizeb = sizeb; + k->size = 1 << k->sizeb; + k->size2 = k->size * k->size; + + k->gauss_radius = k->size / 2 - 1; + k->gauss_middle = XY(k, k->gauss_radius, k->gauss_radius); + + unsigned int gauss_size = k->gauss_radius * 2 + 1; + unsigned int gauss_size2 = gauss_size * gauss_size; + + for (index_t c = 0; c < k->size2; c++) + k->gauss[c] = 0; + + double sigma = -log(1.5 / (double) UINT64_MAX * gauss_size2) / k->gauss_radius; + + for (index_t gy = 0; gy <= k->gauss_radius; gy++) { + for (index_t gx = 0; gx <= gy; gx++) { + int cx = (int)gx - k->gauss_radius; + int cy = (int)gy - k->gauss_radius; + int sq = cx * cx + cy * cy; + double e = exp(-sqrt(sq) * sigma); + uint64_t v = e / gauss_size2 * (double) UINT64_MAX; + k->gauss[XY(k, gx, gy)] = + k->gauss[XY(k, gy, gx)] = + k->gauss[XY(k, gx, gauss_size - 1 - gy)] = + k->gauss[XY(k, gy, gauss_size - 1 - gx)] = + k->gauss[XY(k, gauss_size - 1 - gx, gy)] = + k->gauss[XY(k, gauss_size - 1 - gy, gx)] = + k->gauss[XY(k, gauss_size - 1 - gx, gauss_size - 1 - gy)] = + k->gauss[XY(k, gauss_size - 1 - gy, gauss_size - 1 - gx)] = v; + } + } + +#ifndef NDEBUG + uint64_t total = 0; + for (index_t c = 0; c < k->size2; c++) { + uint64_t oldtotal = total; + total += k->gauss[c]; + assert(total >= oldtotal); + } +#endif +} + +static void setbit(struct ctx *k, index_t c) +{ + if (k->calcmat[c]) + return; + k->calcmat[c] = true; + uint64_t *m = k->gaussmat; + uint64_t *me = k->gaussmat + k->size2; + uint64_t *g = k->gauss + WRAP_SIZE2(k, k->gauss_middle + k->size2 - c); + uint64_t *ge = k->gauss + k->size2; + while (g < ge) + *m++ += *g++; + g = k->gauss; + while (m < me) + *m++ += *g++; +} + +static index_t getmin(struct ctx *k) +{ + uint64_t min = UINT64_MAX; + index_t resnum = 0; + unsigned int size2 = k->size2; + for (index_t c = 0; c < size2; c++) { + if (k->calcmat[c]) + continue; + uint64_t total = k->gaussmat[c]; + if (total <= min) { + if (total != min) { + min = total; + resnum = 0; + } + k->randomat[resnum++] = c; + } + } + assert(resnum > 0); + if (resnum == 1) + return k->randomat[0]; + if (resnum == size2) + return size2 / 2; + return k->randomat[rand() % resnum]; +} + +static void makeuniform(struct ctx *k) 
+{ + unsigned int size2 = k->size2; + for (index_t c = 0; c < size2; c++) { + index_t r = getmin(k); + setbit(k, r); + k->unimat[r] = c; + } +} + +void pl_generate_blue_noise(float *data, int size) +{ + pl_assert(size > 0); + int shift = PL_LOG2(size); + + pl_assert((1 << shift) == size); + struct ctx *k = pl_zalloc_ptr(NULL, k); + makegauss(k, shift); + makeuniform(k); + float invscale = k->size2; + for(index_t y = 0; y < k->size; y++) { + for(index_t x = 0; x < k->size; x++) + data[x + y * k->size] = k->unimat[XY(k, x, y)] / invscale; + } + pl_free(k); +} + +const struct pl_error_diffusion_kernel pl_error_diffusion_simple = { + .name = "simple", + .description = "Simple error diffusion", + .shift = 1, + .pattern = {{0, 0, 0, 1, 0}, + {0, 0, 1, 0, 0}, + {0, 0, 0, 0, 0}}, + .divisor = 2, +}; + +const struct pl_error_diffusion_kernel pl_error_diffusion_false_fs = { + .name = "false-fs", + .description = "False Floyd-Steinberg kernel", + .shift = 1, + .pattern = {{0, 0, 0, 3, 0}, + {0, 0, 3, 2, 0}, + {0, 0, 0, 0, 0}}, + .divisor = 8, +}; + +const struct pl_error_diffusion_kernel pl_error_diffusion_sierra_lite = { + .name = "sierra-lite", + .description = "Sierra Lite kernel", + .shift = 2, + .pattern = {{0, 0, 0, 2, 0}, + {0, 1, 1, 0, 0}, + {0, 0, 0, 0, 0}}, + .divisor = 4, +}; + +const struct pl_error_diffusion_kernel pl_error_diffusion_floyd_steinberg = { + .name = "floyd-steinberg", + .description = "Floyd Steinberg kernel", + .shift = 2, + .pattern = {{0, 0, 0, 7, 0}, + {0, 3, 5, 1, 0}, + {0, 0, 0, 0, 0}}, + .divisor = 16, +}; + +const struct pl_error_diffusion_kernel pl_error_diffusion_atkinson = { + .name = "atkinson", + .description = "Atkinson kernel", + .shift = 2, + .pattern = {{0, 0, 0, 1, 1}, + {0, 1, 1, 1, 0}, + {0, 0, 1, 0, 0}}, + .divisor = 8, +}; + +const struct pl_error_diffusion_kernel pl_error_diffusion_jarvis_judice_ninke = { + .name = "jarvis-judice-ninke", + .description = "Jarvis, Judice & Ninke kernel", + .shift = 3, + .pattern = {{0, 0, 0, 7, 5}, + {3, 5, 7, 5, 3}, + {1, 3, 5, 3, 1}}, + .divisor = 48, +}; + +const struct pl_error_diffusion_kernel pl_error_diffusion_stucki = { + .name = "stucki", + .description = "Stucki kernel", + .shift = 3, + .pattern = {{0, 0, 0, 8, 4}, + {2, 4, 8, 4, 2}, + {1, 2, 4, 2, 1}}, + .divisor = 42, +}; + +const struct pl_error_diffusion_kernel pl_error_diffusion_burkes = { + .name = "burkes", + .description = "Burkes kernel", + .shift = 3, + .pattern = {{0, 0, 0, 8, 4}, + {2, 4, 8, 4, 2}, + {0, 0, 0, 0, 0}}, + .divisor = 32, +}; + +const struct pl_error_diffusion_kernel pl_error_diffusion_sierra2 = { + .name = "sierra-2", + .description = "Two-row Sierra", + .shift = 3, + .pattern = {{0, 0, 0, 4, 3}, + {1, 2, 3, 2, 1}, + {0, 0, 0, 0, 0}}, + .divisor = 16, +}; + +const struct pl_error_diffusion_kernel pl_error_diffusion_sierra3 = { + .name = "sierra-3", + .description = "Three-row Sierra", + .shift = 3, + .pattern = {{0, 0, 0, 5, 3}, + {2, 4, 5, 4, 2}, + {0, 2, 3, 2, 0}}, + .divisor = 32, +}; + +const struct pl_error_diffusion_kernel * const pl_error_diffusion_kernels[] = { + &pl_error_diffusion_simple, + &pl_error_diffusion_false_fs, + &pl_error_diffusion_sierra_lite, + &pl_error_diffusion_floyd_steinberg, + &pl_error_diffusion_atkinson, + &pl_error_diffusion_jarvis_judice_ninke, + &pl_error_diffusion_stucki, + &pl_error_diffusion_burkes, + &pl_error_diffusion_sierra2, + &pl_error_diffusion_sierra3, + NULL +}; + +const int pl_num_error_diffusion_kernels = PL_ARRAY_SIZE(pl_error_diffusion_kernels) - 1; + +// Find the error diffusion 
kernel with the given name, or NULL on failure. +const struct pl_error_diffusion_kernel *pl_find_error_diffusion_kernel(const char *name) +{ + for (int i = 0; i < pl_num_error_diffusion_kernels; i++) { + if (strcmp(name, pl_error_diffusion_kernels[i]->name) == 0) + return pl_error_diffusion_kernels[i]; + } + + return NULL; +} diff --git a/src/dummy.c b/src/dummy.c new file mode 100644 index 0000000..cd80080 --- /dev/null +++ b/src/dummy.c @@ -0,0 +1,348 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <limits.h> +#include <string.h> + +#include "gpu.h" + +#include <libplacebo/dummy.h> + +const struct pl_gpu_dummy_params pl_gpu_dummy_default_params = { PL_GPU_DUMMY_DEFAULTS }; +static const struct pl_gpu_fns pl_fns_dummy; + +struct priv { + struct pl_gpu_fns impl; + struct pl_gpu_dummy_params params; +}; + +pl_gpu pl_gpu_dummy_create(pl_log log, const struct pl_gpu_dummy_params *params) +{ + params = PL_DEF(params, &pl_gpu_dummy_default_params); + + struct pl_gpu_t *gpu = pl_zalloc_obj(NULL, gpu, struct priv); + gpu->log = log; + gpu->glsl = params->glsl; + gpu->limits = params->limits; + + struct priv *p = PL_PRIV(gpu); + p->impl = pl_fns_dummy; + p->params = *params; + + // Forcibly override these, because we know for sure what the values are + gpu->limits.align_tex_xfer_pitch = 1; + gpu->limits.align_tex_xfer_offset = 1; + gpu->limits.align_vertex_stride = 1; + + // Set up the dummy formats, add one for each possible format type that we + // can represent on the host + PL_ARRAY(pl_fmt) formats = {0}; + for (enum pl_fmt_type type = 1; type < PL_FMT_TYPE_COUNT; type++) { + for (int comps = 1; comps <= 4; comps++) { + for (int depth = 8; depth < 128; depth *= 2) { + if (type == PL_FMT_FLOAT && depth < 16) + continue; + + static const char *cnames[] = { + [1] = "r", + [2] = "rg", + [3] = "rgb", + [4] = "rgba", + }; + + static const char *tnames[] = { + [PL_FMT_UNORM] = "", + [PL_FMT_SNORM] = "s", + [PL_FMT_UINT] = "u", + [PL_FMT_SINT] = "i", + [PL_FMT_FLOAT] = "f", + }; + + const char *tname = tnames[type]; + if (type == PL_FMT_FLOAT && depth == 16) + tname = "hf"; + + struct pl_fmt_t *fmt = pl_alloc_ptr(gpu, fmt); + *fmt = (struct pl_fmt_t) { + .name = pl_asprintf(fmt, "%s%d%s", cnames[comps], depth, tname), + .type = type, + .num_components = comps, + .opaque = false, + .gatherable = true, + .internal_size = comps * depth / 8, + .texel_size = comps * depth / 8, + .texel_align = 1, + .caps = PL_FMT_CAP_SAMPLEABLE | PL_FMT_CAP_LINEAR | + PL_FMT_CAP_RENDERABLE | PL_FMT_CAP_BLENDABLE | + PL_FMT_CAP_VERTEX | PL_FMT_CAP_HOST_READABLE, + }; + + for (int i = 0; i < comps; i++) { + fmt->component_depth[i] = depth; + fmt->host_bits[i] = depth; + fmt->sample_order[i] = i; + } + + if (gpu->glsl.compute) + fmt->caps |= PL_FMT_CAP_STORABLE; + if (gpu->limits.max_buffer_texels && gpu->limits.max_ubo_size) + fmt->caps |= 
PL_FMT_CAP_TEXEL_UNIFORM; + if (gpu->limits.max_buffer_texels && gpu->limits.max_ssbo_size) + fmt->caps |= PL_FMT_CAP_TEXEL_STORAGE; + + fmt->glsl_type = pl_var_glsl_type_name(pl_var_from_fmt(fmt, "")); + fmt->glsl_format = pl_fmt_glsl_format(fmt, comps); + fmt->fourcc = pl_fmt_fourcc(fmt); + if (!fmt->glsl_format) + fmt->caps &= ~(PL_FMT_CAP_STORABLE | PL_FMT_CAP_TEXEL_STORAGE); + PL_ARRAY_APPEND(gpu, formats, fmt); + } + } + } + + gpu->formats = formats.elem; + gpu->num_formats = formats.num; + return pl_gpu_finalize(gpu); +} + +static void dumb_destroy(pl_gpu gpu) +{ + pl_free((void *) gpu); +} + +void pl_gpu_dummy_destroy(pl_gpu *gpu) +{ + pl_gpu_destroy(*gpu); + *gpu = NULL; +} + +struct buf_priv { + uint8_t *data; +}; + +static pl_buf dumb_buf_create(pl_gpu gpu, const struct pl_buf_params *params) +{ + struct pl_buf_t *buf = pl_zalloc_obj(NULL, buf, struct buf_priv); + buf->params = *params; + buf->params.initial_data = NULL; + + struct buf_priv *p = PL_PRIV(buf); + p->data = malloc(params->size); + if (!p->data) { + PL_ERR(gpu, "Failed allocating memory for dummy buffer!"); + pl_free(buf); + return NULL; + } + + if (params->initial_data) + memcpy(p->data, params->initial_data, params->size); + if (params->host_mapped) + buf->data = p->data; + + return buf; +} + +static void dumb_buf_destroy(pl_gpu gpu, pl_buf buf) +{ + struct buf_priv *p = PL_PRIV(buf); + free(p->data); + pl_free((void *) buf); +} + +uint8_t *pl_buf_dummy_data(pl_buf buf) +{ + struct buf_priv *p = PL_PRIV(buf); + return p->data; +} + +static void dumb_buf_write(pl_gpu gpu, pl_buf buf, size_t buf_offset, + const void *data, size_t size) +{ + struct buf_priv *p = PL_PRIV(buf); + memcpy(p->data + buf_offset, data, size); +} + +static bool dumb_buf_read(pl_gpu gpu, pl_buf buf, size_t buf_offset, + void *dest, size_t size) +{ + struct buf_priv *p = PL_PRIV(buf); + memcpy(dest, p->data + buf_offset, size); + return true; +} + +static void dumb_buf_copy(pl_gpu gpu, pl_buf dst, size_t dst_offset, + pl_buf src, size_t src_offset, size_t size) +{ + struct buf_priv *dstp = PL_PRIV(dst); + struct buf_priv *srcp = PL_PRIV(src); + memcpy(dstp->data + dst_offset, srcp->data + src_offset, size); +} + +struct tex_priv { + void *data; +}; + +static size_t tex_size(pl_gpu gpu, pl_tex tex) +{ + size_t size = tex->params.format->texel_size * tex->params.w; + size *= PL_DEF(tex->params.h, 1); + size *= PL_DEF(tex->params.d, 1); + return size; +} + +static pl_tex dumb_tex_create(pl_gpu gpu, const struct pl_tex_params *params) +{ + struct pl_tex_t *tex = pl_zalloc_obj(NULL, tex, void *); + tex->params = *params; + tex->params.initial_data = NULL; + + struct tex_priv *p = PL_PRIV(tex); + p->data = malloc(tex_size(gpu, tex)); + if (!p->data) { + PL_ERR(gpu, "Failed allocating memory for dummy texture!"); + pl_free(tex); + return NULL; + } + + if (params->initial_data) + memcpy(p->data, params->initial_data, tex_size(gpu, tex)); + + return tex; +} + +pl_tex pl_tex_dummy_create(pl_gpu gpu, const struct pl_tex_dummy_params *params) +{ + // Only do minimal sanity checking, since this is just a dummy texture + pl_assert(params->format && params->w >= 0 && params->h >= 0 && params->d >= 0); + + struct pl_tex_t *tex = pl_zalloc_obj(NULL, tex, struct tex_priv); + tex->sampler_type = params->sampler_type; + tex->params = (struct pl_tex_params) { + .w = params->w, + .h = params->h, + .d = params->d, + .format = params->format, + .sampleable = true, + .user_data = params->user_data, + }; + + return tex; +} + +static void dumb_tex_destroy(pl_gpu gpu, 
pl_tex tex) +{ + struct tex_priv *p = PL_PRIV(tex); + if (p->data) + free(p->data); + pl_free((void *) tex); +} + +uint8_t *pl_tex_dummy_data(pl_tex tex) +{ + struct tex_priv *p = PL_PRIV(tex); + return p->data; +} + +static bool dumb_tex_upload(pl_gpu gpu, const struct pl_tex_transfer_params *params) +{ + pl_tex tex = params->tex; + struct tex_priv *p = PL_PRIV(tex); + pl_assert(p->data); + + const uint8_t *src = params->ptr; + uint8_t *dst = p->data; + if (params->buf) { + struct buf_priv *bufp = PL_PRIV(params->buf); + src = (uint8_t *) bufp->data + params->buf_offset; + } + + size_t texel_size = tex->params.format->texel_size; + size_t row_size = pl_rect_w(params->rc) * texel_size; + for (int z = params->rc.z0; z < params->rc.z1; z++) { + size_t src_plane = z * params->depth_pitch; + size_t dst_plane = z * tex->params.h * tex->params.w * texel_size; + for (int y = params->rc.y0; y < params->rc.y1; y++) { + size_t src_row = src_plane + y * params->row_pitch; + size_t dst_row = dst_plane + y * tex->params.w * texel_size; + size_t pos = params->rc.x0 * texel_size; + memcpy(&dst[dst_row + pos], &src[src_row + pos], row_size); + } + } + + return true; +} + +static bool dumb_tex_download(pl_gpu gpu, const struct pl_tex_transfer_params *params) +{ + pl_tex tex = params->tex; + struct tex_priv *p = PL_PRIV(tex); + pl_assert(p->data); + + const uint8_t *src = p->data; + uint8_t *dst = params->ptr; + if (params->buf) { + struct buf_priv *bufp = PL_PRIV(params->buf); + dst = (uint8_t *) bufp->data + params->buf_offset; + } + + size_t texel_size = tex->params.format->texel_size; + size_t row_size = pl_rect_w(params->rc) * texel_size; + for (int z = params->rc.z0; z < params->rc.z1; z++) { + size_t src_plane = z * tex->params.h * tex->params.w * texel_size; + size_t dst_plane = z * params->depth_pitch; + for (int y = params->rc.y0; y < params->rc.y1; y++) { + size_t src_row = src_plane + y * tex->params.w * texel_size; + size_t dst_row = dst_plane + y * params->row_pitch; + size_t pos = params->rc.x0 * texel_size; + memcpy(&dst[dst_row + pos], &src[src_row + pos], row_size); + } + } + + return true; +} + +static int dumb_desc_namespace(pl_gpu gpu, enum pl_desc_type type) +{ + return 0; // safest behavior: never alias bindings +} + +static pl_pass dumb_pass_create(pl_gpu gpu, const struct pl_pass_params *params) +{ + PL_ERR(gpu, "Creating render passes is not supported for dummy GPUs"); + return NULL; +} + +static void dumb_gpu_finish(pl_gpu gpu) +{ + // no-op +} + +static const struct pl_gpu_fns pl_fns_dummy = { + .destroy = dumb_destroy, + .buf_create = dumb_buf_create, + .buf_destroy = dumb_buf_destroy, + .buf_write = dumb_buf_write, + .buf_read = dumb_buf_read, + .buf_copy = dumb_buf_copy, + .tex_create = dumb_tex_create, + .tex_destroy = dumb_tex_destroy, + .tex_upload = dumb_tex_upload, + .tex_download = dumb_tex_download, + .desc_namespace = dumb_desc_namespace, + .pass_create = dumb_pass_create, + .gpu_finish = dumb_gpu_finish, +}; diff --git a/src/filters.c b/src/filters.c new file mode 100644 index 0000000..cc4871f --- /dev/null +++ b/src/filters.c @@ -0,0 +1,1015 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +/* + * Some of the filter code originally derives (via mpv) from Glumpy: + * # Copyright (c) 2009-2016 Nicolas P. Rougier. All rights reserved. + * # Distributed under the (new) BSD License. + * (https://github.com/glumpy/glumpy/blob/master/glumpy/library/build-spatial-filters.py) + * + * The math underlying each filter function was written from scratch, with + * some algorithms coming from a number of different sources, including: + * - https://en.wikipedia.org/wiki/Window_function + * - https://en.wikipedia.org/wiki/Jinc + * - http://vector-agg.cvs.sourceforge.net/viewvc/vector-agg/agg-2.5/include/agg_image_filters.h + * - Vapoursynth plugin fmtconv (WTFPL Licensed), which is based on + * dither plugin for avisynth from the same author: + * https://github.com/vapoursynth/fmtconv/tree/master/src/fmtc + * - Paul Heckbert's "zoom" + * - XBMC: ConvolutionKernels.cpp etc. + * - https://github.com/AviSynth/jinc-resize (only used to verify the math) + */ + +#include <math.h> + +#include "common.h" +#include "filters.h" +#include "log.h" + +#ifdef PL_HAVE_WIN32 +#define j1 _j1 +#endif + +bool pl_filter_function_eq(const struct pl_filter_function *a, + const struct pl_filter_function *b) +{ + return (a ? a->weight : NULL) == (b ? b->weight : NULL); +} + +bool pl_filter_config_eq(const struct pl_filter_config *a, + const struct pl_filter_config *b) +{ + if (!a || !b) + return a == b; + + bool eq = pl_filter_function_eq(a->kernel, b->kernel) && + pl_filter_function_eq(a->window, b->window) && + a->radius == b->radius && + a->clamp == b->clamp && + a->blur == b->blur && + a->taper == b->taper && + a->polar == b->polar && + a->antiring == b->antiring; + + for (int i = 0; i < PL_FILTER_MAX_PARAMS; i++) { + if (a->kernel->tunable[i]) + eq &= a->params[i] == b->params[i]; + if (a->window && a->window->tunable[i]) + eq &= a->wparams[i] == b->wparams[i]; + } + + return eq; +} + +double pl_filter_sample(const struct pl_filter_config *c, double x) +{ + const float radius = pl_filter_radius_bound(c); + + // All filters are symmetric, and in particular only need to be defined + // for [0, radius]. + x = fabs(x); + + // Return early for values outside of the kernel radius, since the functions + // are not necessarily valid outside of this interval. No such check is + // needed for the window, because it's always stretched to fit. + if (x > radius) + return 0.0; + + // Apply the blur and taper coefficients as needed + double kx = x <= c->taper ? 0.0 : (x - c->taper) / (1.0 - c->taper / radius); + if (c->blur > 0.0) + kx /= c->blur; + + pl_assert(!c->kernel->opaque); + double k = c->kernel->weight(&(const struct pl_filter_ctx) { + .radius = radius, + .params = { + c->kernel->tunable[0] ? c->params[0] : c->kernel->params[0], + c->kernel->tunable[1] ? c->params[1] : c->kernel->params[1], + }, + }, kx); + + // Apply the optional windowing function + if (c->window) { + pl_assert(!c->window->opaque); + double wx = x / radius * c->window->radius; + k *= c->window->weight(&(struct pl_filter_ctx) { + .radius = c->window->radius, + .params = { + c->window->tunable[0] ? 
c->wparams[0] : c->window->params[0], + c->window->tunable[1] ? c->wparams[1] : c->window->params[1], + }, + }, wx); + } + + return k < 0 ? (1 - c->clamp) * k : k; +} + +static void filter_cutoffs(const struct pl_filter_config *c, float cutoff, + float *out_radius, float *out_radius_zero) +{ + const float bound = pl_filter_radius_bound(c); + float prev = 0.0, fprev = pl_filter_sample(c, prev); + bool found_root = false; + + const float step = 1e-2f; + for (float x = 0.0; x < bound + step; x += step) { + float fx = pl_filter_sample(c, x); + if ((fprev > cutoff && fx <= cutoff) || (fprev < -cutoff && fx >= -cutoff)) { + // Found zero crossing + float root = x - fx * (x - prev) / (fx - fprev); // secant method + root = fminf(root, bound); + *out_radius = root; + if (!found_root) // first root + *out_radius_zero = root; + found_root = true; + } + prev = x; + fprev = fx; + } + + if (!found_root) + *out_radius_zero = *out_radius = bound; +} + +// Compute a single row of weights for a given filter in one dimension, indexed +// by the indicated subpixel offset. Writes `f->row_size` values to `out`. +static void compute_row(struct pl_filter_t *f, double offset, float *out) +{ + double wsum = 0.0; + for (int i = 0; i < f->row_size; i++) { + // For the example of a filter with row size 4 and offset 0.3, we have: + // + // 0 1 * 2 3 + // + // * indicates the sampled position. What we want to compute is the + // distance from each index to that sampled position. + pl_assert(f->row_size % 2 == 0); + const int base = f->row_size / 2 - 1; // index to the left of the center + const double center = base + offset; // offset of center relative to idx 0 + double w = pl_filter_sample(&f->params.config, i - center); + out[i] = w; + wsum += w; + } + + // Readjust weights to preserve energy + pl_assert(wsum > 0); + for (int i = 0; i < f->row_size; i++) + out[i] /= wsum; +} + +// Needed for backwards compatibility with v1 configuration API +static struct pl_filter_function *dupfilter(void *alloc, + const struct pl_filter_function *f) +{ + return f ? 
pl_memdup(alloc, (void *)f, sizeof(*f)) : NULL; +} + +pl_filter pl_filter_generate(pl_log log, const struct pl_filter_params *params) +{ + pl_assert(params); + if (params->lut_entries <= 0 || !params->config.kernel) { + pl_fatal(log, "Invalid params: missing lut_entries or config.kernel"); + return NULL; + } + + if (params->config.kernel->opaque) { + pl_err(log, "Trying to use opaque kernel '%s' in non-opaque context!", + params->config.kernel->name); + return NULL; + } + + if (params->config.window && params->config.window->opaque) { + pl_err(log, "Trying to use opaque window '%s' in non-opaque context!", + params->config.window->name); + return NULL; + } + + struct pl_filter_t *f = pl_zalloc_ptr(NULL, f); + f->params = *params; + f->params.config.kernel = dupfilter(f, params->config.kernel); + f->params.config.window = dupfilter(f, params->config.window); + + // Compute main lobe and total filter size + filter_cutoffs(¶ms->config, params->cutoff, &f->radius, &f->radius_zero); + f->radius_cutoff = f->radius; // backwards compatibility + + float *weights; + if (params->config.polar) { + // Compute a 1D array indexed by radius + weights = pl_alloc(f, params->lut_entries * sizeof(float)); + for (int i = 0; i < params->lut_entries; i++) { + double x = f->radius * i / (params->lut_entries - 1); + weights[i] = pl_filter_sample(¶ms->config, x); + } + } else { + // Pick the most appropriate row size + f->row_size = ceilf(f->radius) * 2; + if (params->max_row_size && f->row_size > params->max_row_size) { + pl_info(log, "Required filter size %d exceeds the maximum allowed " + "size of %d. This may result in adverse effects (aliasing, " + "or moiré artifacts).", f->row_size, params->max_row_size); + f->row_size = params->max_row_size; + f->insufficient = true; + } + f->row_stride = PL_ALIGN(f->row_size, params->row_stride_align); + + // Compute a 2D array indexed by the subpixel position + weights = pl_calloc(f, params->lut_entries * f->row_stride, sizeof(float)); + for (int i = 0; i < params->lut_entries; i++) { + compute_row(f, i / (double)(params->lut_entries - 1), + weights + f->row_stride * i); + } + } + + f->weights = weights; + return f; +} + +void pl_filter_free(pl_filter *filter) +{ + pl_free_ptr((void **) filter); +} + +// Built-in filter functions + +static double box(const struct pl_filter_ctx *f, double x) +{ + return 1.0; +} + +const struct pl_filter_function pl_filter_function_box = { + .weight = box, + .name = "box", + .radius = 1.0, + .resizable = true, +}; + +static const struct pl_filter_function filter_function_dirichlet = { + .name = "dirichlet", // alias + .weight = box, + .radius = 1.0, + .resizable = true, +}; + +static double triangle(const struct pl_filter_ctx *f, double x) +{ + return 1.0 - x / f->radius; +} + +const struct pl_filter_function pl_filter_function_triangle = { + .name = "triangle", + .weight = triangle, + .radius = 1.0, + .resizable = true, +}; + +static double cosine(const struct pl_filter_ctx *f, double x) +{ + return cos(x); +} + +const struct pl_filter_function pl_filter_function_cosine = { + .name = "cosine", + .weight = cosine, + .radius = M_PI / 2.0, +}; + +static double hann(const struct pl_filter_ctx *f, double x) +{ + return 0.5 + 0.5 * cos(M_PI * x); +} + +const struct pl_filter_function pl_filter_function_hann = { + .name = "hann", + .weight = hann, + .radius = 1.0, +}; + +static const struct pl_filter_function filter_function_hanning = { + .name = "hanning", // alias + .weight = hann, + .radius = 1.0, +}; + +static double hamming(const struct 
pl_filter_ctx *f, double x) +{ + return 0.54 + 0.46 * cos(M_PI * x); +} + +const struct pl_filter_function pl_filter_function_hamming = { + .name = "hamming", + .weight = hamming, + .radius = 1.0, +}; + +static double welch(const struct pl_filter_ctx *f, double x) +{ + return 1.0 - x * x; +} + +const struct pl_filter_function pl_filter_function_welch = { + .name = "welch", + .weight = welch, + .radius = 1.0, +}; + +static double bessel_i0(double x) +{ + double s = 1.0; + double y = x * x / 4.0; + double t = y; + int i = 2; + while (t > 1e-12) { + s += t; + t *= y / (i * i); + i += 1; + } + return s; +} + +static double kaiser(const struct pl_filter_ctx *f, double x) +{ + double alpha = fmax(f->params[0], 0.0); + double scale = bessel_i0(alpha); + return bessel_i0(alpha * sqrt(1.0 - x * x)) / scale; +} + +const struct pl_filter_function pl_filter_function_kaiser = { + .name = "kaiser", + .weight = kaiser, + .radius = 1.0, + .params = {2.0}, + .tunable = {true}, +}; + +static double blackman(const struct pl_filter_ctx *f, double x) +{ + double a = f->params[0]; + double a0 = (1 - a) / 2.0, a1 = 1 / 2.0, a2 = a / 2.0; + x *= M_PI; + return a0 + a1 * cos(x) + a2 * cos(2 * x); +} + +const struct pl_filter_function pl_filter_function_blackman = { + .name = "blackman", + .weight = blackman, + .radius = 1.0, + .params = {0.16}, + .tunable = {true}, +}; + +static double bohman(const struct pl_filter_ctx *f, double x) +{ + double pix = M_PI * x; + return (1.0 - x) * cos(pix) + sin(pix) / M_PI; +} + +const struct pl_filter_function pl_filter_function_bohman = { + .name = "bohman", + .weight = bohman, + .radius = 1.0, +}; + +static double gaussian(const struct pl_filter_ctx *f, double x) +{ + return exp(-2.0 * x * x / f->params[0]); +} + +const struct pl_filter_function pl_filter_function_gaussian = { + .name = "gaussian", + .weight = gaussian, + .radius = 2.0, + .resizable = true, + .params = {1.0}, + .tunable = {true}, +}; + +static double quadratic(const struct pl_filter_ctx *f, double x) +{ + if (x < 0.5) { + return 1.0 - 4.0/3.0 * (x * x); + } else { + return 2.0 / 3.0 * (x - 1.5) * (x - 1.5); + } +} + +const struct pl_filter_function pl_filter_function_quadratic = { + .name = "quadratic", + .weight = quadratic, + .radius = 1.5, +}; + +static const struct pl_filter_function filter_function_quadric = { + .name = "quadric", // alias + .weight = quadratic, + .radius = 1.5, +}; + +static double sinc(const struct pl_filter_ctx *f, double x) +{ + if (x < 1e-8) + return 1.0; + x *= M_PI; + return sin(x) / x; +} + +const struct pl_filter_function pl_filter_function_sinc = { + .name = "sinc", + .weight = sinc, + .radius = 1.0, + .resizable = true, +}; + +static double jinc(const struct pl_filter_ctx *f, double x) +{ + if (x < 1e-8) + return 1.0; + x *= M_PI; + return 2.0 * j1(x) / x; +} + +const struct pl_filter_function pl_filter_function_jinc = { + .name = "jinc", + .weight = jinc, + .radius = 1.2196698912665045, // first zero + .resizable = true, +}; + +static double sphinx(const struct pl_filter_ctx *f, double x) +{ + if (x < 1e-8) + return 1.0; + x *= M_PI; + return 3.0 * (sin(x) - x * cos(x)) / (x * x * x); +} + +const struct pl_filter_function pl_filter_function_sphinx = { + .name = "sphinx", + .weight = sphinx, + .radius = 1.4302966531242027, // first zero + .resizable = true, +}; + +static double cubic(const struct pl_filter_ctx *f, double x) +{ + const double b = f->params[0], c = f->params[1]; + double p0 = 6.0 - 2.0 * b, + p2 = -18.0 + 12.0 * b + 6.0 * c, + p3 = 12.0 - 9.0 * b - 6.0 * c, + q0 
= 8.0 * b + 24.0 * c, + q1 = -12.0 * b - 48.0 * c, + q2 = 6.0 * b + 30.0 * c, + q3 = -b - 6.0 * c; + + if (x < 1.0) { + return (p0 + x * x * (p2 + x * p3)) / p0; + } else { + return (q0 + x * (q1 + x * (q2 + x * q3))) / p0; + } +} + +const struct pl_filter_function pl_filter_function_cubic = { + .name = "cubic", + .weight = cubic, + .radius = 2.0, + .params = {1.0, 0.0}, + .tunable = {true, true}, +}; + +static const struct pl_filter_function filter_function_bicubic = { + .name = "bicubic", // alias + .weight = cubic, + .radius = 2.0, + .params = {1.0, 0.0}, + .tunable = {true, true}, +}; + +static const struct pl_filter_function filter_function_bcspline = { + .name = "bcspline", // alias + .weight = cubic, + .radius = 2.0, + .params = {1.0, 0.0}, + .tunable = {true, true}, +}; + +const struct pl_filter_function pl_filter_function_hermite = { + .name = "hermite", + .weight = cubic, + .radius = 1.0, + .params = {0.0, 0.0}, +}; + +static double spline16(const struct pl_filter_ctx *f, double x) +{ + if (x < 1.0) { + return ((x - 9.0/5.0 ) * x - 1.0/5.0 ) * x + 1.0; + } else { + return ((-1.0/3.0 * (x-1) + 4.0/5.0) * (x-1) - 7.0/15.0 ) * (x-1); + } +} + +const struct pl_filter_function pl_filter_function_spline16 = { + .name = "spline16", + .weight = spline16, + .radius = 2.0, +}; + +static double spline36(const struct pl_filter_ctx *f, double x) +{ + if (x < 1.0) { + return ((13.0/11.0 * x - 453.0/209.0) * x - 3.0/209.0) * x + 1.0; + } else if (x < 2.0) { + return ((-6.0/11.0 * (x-1) + 270.0/209.0) * (x-1) - 156.0/ 209.0) * (x-1); + } else { + return ((1.0/11.0 * (x-2) - 45.0/209.0) * (x-2) + 26.0/209.0) * (x-2); + } +} + +const struct pl_filter_function pl_filter_function_spline36 = { + .name = "spline36", + .weight = spline36, + .radius = 3.0, +}; + +static double spline64(const struct pl_filter_ctx *f, double x) +{ + if (x < 1.0) { + return ((49.0/41.0 * x - 6387.0/2911.0) * x - 3.0/2911.0) * x + 1.0; + } else if (x < 2.0) { + return ((-24.0/41.0 * (x-1) + 4032.0/2911.0) * (x-1) - 2328.0/2911.0) * (x-1); + } else if (x < 3.0) { + return ((6.0/41.0 * (x-2) - 1008.0/2911.0) * (x-2) + 582.0/2911.0) * (x-2); + } else { + return ((-1.0/41.0 * (x-3) + 168.0/2911.0) * (x-3) - 97.0/2911.0) * (x-3); + } +} + +const struct pl_filter_function pl_filter_function_spline64 = { + .name = "spline64", + .weight = spline64, + .radius = 4.0, +}; + +static double oversample(const struct pl_filter_ctx *f, double x) +{ + return 0.0; +} + +const struct pl_filter_function pl_filter_function_oversample = { + .name = "oversample", + .weight = oversample, + .params = {0.0}, + .tunable = {true}, + .opaque = true, +}; + +const struct pl_filter_function * const pl_filter_functions[] = { + &pl_filter_function_box, + &filter_function_dirichlet, // alias + &pl_filter_function_triangle, + &pl_filter_function_cosine, + &pl_filter_function_hann, + &filter_function_hanning, // alias + &pl_filter_function_hamming, + &pl_filter_function_welch, + &pl_filter_function_kaiser, + &pl_filter_function_blackman, + &pl_filter_function_bohman, + &pl_filter_function_gaussian, + &pl_filter_function_quadratic, + &filter_function_quadric, // alias + &pl_filter_function_sinc, + &pl_filter_function_jinc, + &pl_filter_function_sphinx, + &pl_filter_function_cubic, + &filter_function_bicubic, // alias + &filter_function_bcspline, // alias + &pl_filter_function_hermite, + &pl_filter_function_spline16, + &pl_filter_function_spline36, + &pl_filter_function_spline64, + &pl_filter_function_oversample, + NULL, +}; + +const int pl_num_filter_functions = 
PL_ARRAY_SIZE(pl_filter_functions) - 1; + +const struct pl_filter_function *pl_find_filter_function(const char *name) +{ + if (!name) + return NULL; + + for (int i = 0; i < pl_num_filter_functions; i++) { + if (strcmp(name, pl_filter_functions[i]->name) == 0) + return pl_filter_functions[i]; + } + + return NULL; +} + +// Built-in filter function configs + +const struct pl_filter_config pl_filter_spline16 = { + .name = "spline16", + .description = "Spline (2 taps)", + .kernel = &pl_filter_function_spline16, + .allowed = PL_FILTER_ALL, +}; + +const struct pl_filter_config pl_filter_spline36 = { + .name = "spline36", + .description = "Spline (3 taps)", + .kernel = &pl_filter_function_spline36, + .allowed = PL_FILTER_ALL, +}; + +const struct pl_filter_config pl_filter_spline64 = { + .name = "spline64", + .description = "Spline (4 taps)", + .kernel = &pl_filter_function_spline64, + .allowed = PL_FILTER_ALL, +}; + +const struct pl_filter_config pl_filter_nearest = { + .name = "nearest", + .description = "Nearest neighbor", + .kernel = &pl_filter_function_box, + .radius = 0.5, + .allowed = PL_FILTER_UPSCALING, + .recommended = PL_FILTER_UPSCALING, +}; + +const struct pl_filter_config pl_filter_box = { + .name = "box", + .description = "Box averaging", + .kernel = &pl_filter_function_box, + .radius = 0.5, + .allowed = PL_FILTER_SCALING, + .recommended = PL_FILTER_DOWNSCALING, +}; + +const struct pl_filter_config pl_filter_bilinear = { + .name = "bilinear", + .description = "Bilinear", + .kernel = &pl_filter_function_triangle, + .allowed = PL_FILTER_ALL, + .recommended = PL_FILTER_SCALING, +}; + +const struct pl_filter_config filter_linear = { + .name = "linear", + .description = "Linear mixing", + .kernel = &pl_filter_function_triangle, + .allowed = PL_FILTER_FRAME_MIXING, + .recommended = PL_FILTER_FRAME_MIXING, +}; + +static const struct pl_filter_config filter_triangle = { + .name = "triangle", + .kernel = &pl_filter_function_triangle, + .allowed = PL_FILTER_SCALING, +}; + +const struct pl_filter_config pl_filter_gaussian = { + .name = "gaussian", + .description = "Gaussian", + .kernel = &pl_filter_function_gaussian, + .params = {1.0}, + .allowed = PL_FILTER_ALL, + .recommended = PL_FILTER_SCALING, +}; + +const struct pl_filter_config pl_filter_sinc = { + .name = "sinc", + .description = "Sinc (unwindowed)", + .kernel = &pl_filter_function_sinc, + .radius = 3.0, + .allowed = PL_FILTER_ALL, +}; + +const struct pl_filter_config pl_filter_lanczos = { + .name = "lanczos", + .description = "Lanczos", + .kernel = &pl_filter_function_sinc, + .window = &pl_filter_function_sinc, + .radius = 3.0, + .allowed = PL_FILTER_ALL, + .recommended = PL_FILTER_SCALING, +}; + +const struct pl_filter_config pl_filter_ginseng = { + .name = "ginseng", + .description = "Ginseng (Jinc-Sinc)", + .kernel = &pl_filter_function_sinc, + .window = &pl_filter_function_jinc, + .radius = 3.0, + .allowed = PL_FILTER_ALL, +}; + +#define JINC_ZERO3 3.2383154841662362076499 +#define JINC_ZERO4 4.2410628637960698819573 + +const struct pl_filter_config pl_filter_ewa_jinc = { + .name = "ewa_jinc", + .description = "EWA Jinc (unwindowed)", + .kernel = &pl_filter_function_jinc, + .radius = JINC_ZERO3, + .polar = true, + .allowed = PL_FILTER_SCALING, +}; + +const struct pl_filter_config pl_filter_ewa_lanczos = { + .name = "ewa_lanczos", + .description = "Jinc (EWA Lanczos)", + .kernel = &pl_filter_function_jinc, + .window = &pl_filter_function_jinc, + .radius = JINC_ZERO3, + .polar = true, + .allowed = PL_FILTER_SCALING, + .recommended = 
PL_FILTER_UPSCALING, +}; + +const struct pl_filter_config pl_filter_ewa_lanczossharp = { + .name = "ewa_lanczossharp", + .description = "Sharpened Jinc", + .kernel = &pl_filter_function_jinc, + .window = &pl_filter_function_jinc, + .radius = JINC_ZERO3, + .blur = 0.98125058372237073562493, + .polar = true, + .allowed = PL_FILTER_SCALING, + .recommended = PL_FILTER_UPSCALING, +}; + +const struct pl_filter_config pl_filter_ewa_lanczos4sharpest = { + .name = "ewa_lanczos4sharpest", + .description = "Sharpened Jinc-AR, 4 taps", + .kernel = &pl_filter_function_jinc, + .window = &pl_filter_function_jinc, + .radius = JINC_ZERO4, + .blur = 0.88451209326050047745788, + .antiring = 0.8, + .polar = true, + .allowed = PL_FILTER_SCALING, + .recommended = PL_FILTER_UPSCALING, +}; + +const struct pl_filter_config pl_filter_ewa_ginseng = { + .name = "ewa_ginseng", + .description = "EWA Ginseng", + .kernel = &pl_filter_function_jinc, + .window = &pl_filter_function_sinc, + .radius = JINC_ZERO3, + .polar = true, + .allowed = PL_FILTER_SCALING, +}; + +const struct pl_filter_config pl_filter_ewa_hann = { + .name = "ewa_hann", + .description = "EWA Hann", + .kernel = &pl_filter_function_jinc, + .window = &pl_filter_function_hann, + .radius = JINC_ZERO3, + .polar = true, + .allowed = PL_FILTER_SCALING, +}; + +static const struct pl_filter_config filter_ewa_hanning = { + .name = "ewa_hanning", + .kernel = &pl_filter_function_jinc, + .window = &pl_filter_function_hann, + .radius = JINC_ZERO3, + .polar = true, + .allowed = PL_FILTER_SCALING, +}; + +// Spline family +const struct pl_filter_config pl_filter_bicubic = { + .name = "bicubic", + .description = "Bicubic", + .kernel = &pl_filter_function_cubic, + .params = {1.0, 0.0}, + .allowed = PL_FILTER_SCALING, + .recommended = PL_FILTER_SCALING, +}; + +static const struct pl_filter_config filter_cubic = { + .name = "cubic", + .description = "Cubic", + .kernel = &pl_filter_function_cubic, + .params = {1.0, 0.0}, + .allowed = PL_FILTER_FRAME_MIXING, +}; + +const struct pl_filter_config pl_filter_hermite = { + .name = "hermite", + .description = "Hermite", + .kernel = &pl_filter_function_hermite, + .allowed = PL_FILTER_ALL, + .recommended = PL_FILTER_DOWNSCALING | PL_FILTER_FRAME_MIXING, +}; + +const struct pl_filter_config pl_filter_catmull_rom = { + .name = "catmull_rom", + .description = "Catmull-Rom", + .kernel = &pl_filter_function_cubic, + .params = {0.0, 0.5}, + .allowed = PL_FILTER_ALL, + .recommended = PL_FILTER_SCALING, +}; + +const struct pl_filter_config pl_filter_mitchell = { + .name = "mitchell", + .description = "Mitchell-Netravali", + .kernel = &pl_filter_function_cubic, + .params = {1/3.0, 1/3.0}, + .allowed = PL_FILTER_ALL, + .recommended = PL_FILTER_DOWNSCALING, +}; + +const struct pl_filter_config pl_filter_mitchell_clamp = { + .name = "mitchell_clamp", + .description = "Mitchell (clamped)", + .kernel = &pl_filter_function_cubic, + .params = {1/3.0, 1/3.0}, + .clamp = 1.0, + .allowed = PL_FILTER_ALL, +}; + +const struct pl_filter_config pl_filter_robidoux = { + .name = "robidoux", + .description = "Robidoux", + .kernel = &pl_filter_function_cubic, + .params = {12 / (19 + 9 * M_SQRT2), 113 / (58 + 216 * M_SQRT2)}, + .allowed = PL_FILTER_ALL, +}; + +const struct pl_filter_config pl_filter_robidouxsharp = { + .name = "robidouxsharp", + .description = "RobidouxSharp", + .kernel = &pl_filter_function_cubic, + .params = {6 / (13 + 7 * M_SQRT2), 7 / (2 + 12 * M_SQRT2)}, + .allowed = PL_FILTER_ALL, +}; + +const struct pl_filter_config pl_filter_ewa_robidoux 
= { + .name = "ewa_robidoux", + .description = "EWA Robidoux", + .kernel = &pl_filter_function_cubic, + .params = {12 / (19 + 9 * M_SQRT2), 113 / (58 + 216 * M_SQRT2)}, + .polar = true, + .allowed = PL_FILTER_SCALING, +}; + +const struct pl_filter_config pl_filter_ewa_robidouxsharp = { + .name = "ewa_robidouxsharp", + .description = "EWA RobidouxSharp", + .kernel = &pl_filter_function_cubic, + .params = {6 / (13 + 7 * M_SQRT2), 7 / (2 + 12 * M_SQRT2)}, + .polar = true, + .allowed = PL_FILTER_SCALING, +}; + +const struct pl_filter_config pl_filter_oversample = { + .name = "oversample", + .description = "Oversampling", + .kernel = &pl_filter_function_oversample, + .params = {0.0}, + .allowed = PL_FILTER_UPSCALING | PL_FILTER_FRAME_MIXING, + .recommended = PL_FILTER_UPSCALING | PL_FILTER_FRAME_MIXING, +}; + +const struct pl_filter_config * const pl_filter_configs[] = { + // Sorted roughly in terms of priority / relevance + &pl_filter_bilinear, + &filter_triangle, // alias + &filter_linear, // pseudo-alias (frame mixing only) + &pl_filter_nearest, + &pl_filter_spline16, + &pl_filter_spline36, + &pl_filter_spline64, + &pl_filter_lanczos, + &pl_filter_ewa_lanczos, + &pl_filter_ewa_lanczossharp, + &pl_filter_ewa_lanczos4sharpest, + &pl_filter_bicubic, + &filter_cubic, // pseudo-alias (frame mixing only) + &pl_filter_hermite, + &pl_filter_gaussian, + &pl_filter_oversample, + &pl_filter_mitchell, + &pl_filter_mitchell_clamp, + &pl_filter_sinc, + &pl_filter_ginseng, + &pl_filter_ewa_jinc, + &pl_filter_ewa_ginseng, + &pl_filter_ewa_hann, + &filter_ewa_hanning, // alias + &pl_filter_catmull_rom, + &pl_filter_robidoux, + &pl_filter_robidouxsharp, + &pl_filter_ewa_robidoux, + &pl_filter_ewa_robidouxsharp, + + NULL, +}; + +const int pl_num_filter_configs = PL_ARRAY_SIZE(pl_filter_configs) - 1; + +const struct pl_filter_config * +pl_find_filter_config(const char *name, enum pl_filter_usage usage) +{ + if (!name) + return NULL; + + for (int i = 0; i < pl_num_filter_configs; i++) { + if ((pl_filter_configs[i]->allowed & usage) != usage) + continue; + if (strcmp(name, pl_filter_configs[i]->name) == 0) + return pl_filter_configs[i]; + } + + return NULL; +} + +// Backwards compatibility with older API + +const struct pl_filter_function_preset pl_filter_function_presets[] = { + {"none", NULL}, + {"box", &pl_filter_function_box}, + {"dirichlet", &filter_function_dirichlet}, // alias + {"triangle", &pl_filter_function_triangle}, + {"cosine", &pl_filter_function_cosine}, + {"hann", &pl_filter_function_hann}, + {"hanning", &filter_function_hanning}, // alias + {"hamming", &pl_filter_function_hamming}, + {"welch", &pl_filter_function_welch}, + {"kaiser", &pl_filter_function_kaiser}, + {"blackman", &pl_filter_function_blackman}, + {"bohman", &pl_filter_function_bohman}, + {"gaussian", &pl_filter_function_gaussian}, + {"quadratic", &pl_filter_function_quadratic}, + {"quadric", &filter_function_quadric}, // alias + {"sinc", &pl_filter_function_sinc}, + {"jinc", &pl_filter_function_jinc}, + {"sphinx", &pl_filter_function_sphinx}, + {"cubic", &pl_filter_function_cubic}, + {"bicubic", &filter_function_bicubic}, // alias + {"bcspline", &filter_function_bcspline}, // alias + {"hermite", &pl_filter_function_hermite}, + {"spline16", &pl_filter_function_spline16}, + {"spline36", &pl_filter_function_spline36}, + {"spline64", &pl_filter_function_spline64}, + {0}, +}; + +const int pl_num_filter_function_presets = PL_ARRAY_SIZE(pl_filter_function_presets) - 1; + +const struct pl_filter_function_preset 
*pl_find_filter_function_preset(const char *name) +{ + if (!name) + return NULL; + + for (int i = 0; pl_filter_function_presets[i].name; i++) { + if (strcmp(pl_filter_function_presets[i].name, name) == 0) + return &pl_filter_function_presets[i]; + } + + return NULL; +} + +const struct pl_filter_preset *pl_find_filter_preset(const char *name) +{ + if (!name) + return NULL; + + for (int i = 0; pl_filter_presets[i].name; i++) { + if (strcmp(pl_filter_presets[i].name, name) == 0) + return &pl_filter_presets[i]; + } + + return NULL; +} + +const struct pl_filter_preset pl_filter_presets[] = { + {"none", NULL, "Built-in sampling"}, + COMMON_FILTER_PRESETS, + {0} +}; + +const int pl_num_filter_presets = PL_ARRAY_SIZE(pl_filter_presets) - 1; diff --git a/src/filters.h b/src/filters.h new file mode 100644 index 0000000..c3227db --- /dev/null +++ b/src/filters.h @@ -0,0 +1,58 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include <libplacebo/filters.h> + +static inline float pl_filter_radius_bound(const struct pl_filter_config *c) +{ + const float r = c->radius && c->kernel->resizable ? c->radius : c->kernel->radius; + return c->blur > 0.0 ? 
r * c->blur : r; +} + +#define COMMON_FILTER_PRESETS \ + /* Highest priority / recommended filters */ \ + {"bilinear", &pl_filter_bilinear, "Bilinear"}, \ + {"nearest", &pl_filter_nearest, "Nearest neighbour"}, \ + {"bicubic", &pl_filter_bicubic, "Bicubic"}, \ + {"lanczos", &pl_filter_lanczos, "Lanczos"}, \ + {"ewa_lanczos", &pl_filter_ewa_lanczos, "Jinc (EWA Lanczos)"}, \ + {"ewa_lanczossharp", &pl_filter_ewa_lanczossharp, "Sharpened Jinc"}, \ + {"ewa_lanczos4sharpest",&pl_filter_ewa_lanczos4sharpest, "Sharpened Jinc-AR, 4 taps"},\ + {"gaussian", &pl_filter_gaussian, "Gaussian"}, \ + {"spline16", &pl_filter_spline16, "Spline (2 taps)"}, \ + {"spline36", &pl_filter_spline36, "Spline (3 taps)"}, \ + {"spline64", &pl_filter_spline64, "Spline (4 taps)"}, \ + {"mitchell", &pl_filter_mitchell, "Mitchell-Netravali"}, \ + \ + /* Remaining filters */ \ + {"sinc", &pl_filter_sinc, "Sinc (unwindowed)"}, \ + {"ginseng", &pl_filter_ginseng, "Ginseng (Jinc-Sinc)"}, \ + {"ewa_jinc", &pl_filter_ewa_jinc, "EWA Jinc (unwindowed)"}, \ + {"ewa_ginseng", &pl_filter_ewa_ginseng, "EWA Ginseng"}, \ + {"ewa_hann", &pl_filter_ewa_hann, "EWA Hann"}, \ + {"hermite", &pl_filter_hermite, "Hermite"}, \ + {"catmull_rom", &pl_filter_catmull_rom, "Catmull-Rom"}, \ + {"robidoux", &pl_filter_robidoux, "Robidoux"}, \ + {"robidouxsharp", &pl_filter_robidouxsharp, "RobidouxSharp"}, \ + {"ewa_robidoux", &pl_filter_ewa_robidoux, "EWA Robidoux"}, \ + {"ewa_robidouxsharp", &pl_filter_ewa_robidouxsharp, "EWA RobidouxSharp"}, \ + \ + /* Aliases */ \ + {"triangle", &pl_filter_bilinear}, \ + {"ewa_hanning", &pl_filter_ewa_hann} diff --git a/src/format.c b/src/format.c new file mode 100644 index 0000000..458d493 --- /dev/null +++ b/src/format.c @@ -0,0 +1,205 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <math.h> + +#include "common.h" + +void pl_str_append_asprintf_c(void *alloc, pl_str *str, const char *fmt, ...) 
+{ + va_list ap; + va_start(ap, fmt); + pl_str_append_vasprintf_c(alloc, str, fmt, ap); + va_end(ap); +} + +void pl_str_append_vasprintf_c(void *alloc, pl_str *str, const char *fmt, + va_list ap) +{ + for (const char *c; (c = strchr(fmt, '%')) != NULL; fmt = c + 1) { + // Append the preceding string literal + pl_str_append_raw(alloc, str, fmt, c - fmt); + c++; // skip '%' + + char buf[32]; + int len; + + // The format character follows the % sign + switch (c[0]) { + case '%': + pl_str_append_raw(alloc, str, c, 1); + continue; + case 's': { + const char *arg = va_arg(ap, const char *); + pl_str_append_raw(alloc, str, arg, strlen(arg)); + continue; + } + case '.': { // only used for %.*s + assert(c[1] == '*'); + assert(c[2] == 's'); + len = va_arg(ap, int); + pl_str_append_raw(alloc, str, va_arg(ap, char *), len); + c += 2; // skip '*s' + continue; + } + case 'c': + buf[0] = (char) va_arg(ap, int); + len = 1; + break; + case 'd': + len = pl_str_print_int(buf, sizeof(buf), va_arg(ap, int)); + break; + case 'h': ; // only used for %hx + assert(c[1] == 'x'); + len = pl_str_print_hex(buf, sizeof(buf), (unsigned short) va_arg(ap, unsigned int)); + c++; + break; + case 'u': + len = pl_str_print_uint(buf, sizeof(buf), va_arg(ap, unsigned int)); + break; + case 'l': + assert(c[1] == 'l'); + switch (c[2]) { + case 'u': + len = pl_str_print_uint64(buf, sizeof(buf), va_arg(ap, unsigned long long)); + break; + case 'd': + len = pl_str_print_int64(buf, sizeof(buf), va_arg(ap, long long)); + break; + default: pl_unreachable(); + } + c += 2; + break; + case 'z': + assert(c[1] == 'u'); + len = pl_str_print_uint64(buf, sizeof(buf), va_arg(ap, size_t)); + c++; + break; + case 'f': + len = pl_str_print_double(buf, sizeof(buf), va_arg(ap, double)); + break; + default: + fprintf(stderr, "Invalid conversion character: '%c'!\n", c[0]); + abort(); + } + + pl_str_append_raw(alloc, str, buf, len); + } + + // Append the remaining string literal + pl_str_append(alloc, str, pl_str0(fmt)); +} + +size_t pl_str_append_memprintf_c(void *alloc, pl_str *str, const char *fmt, + const void *args) +{ + const uint8_t *ptr = args; + + for (const char *c; (c = strchr(fmt, '%')) != NULL; fmt = c + 1) { + pl_str_append_raw(alloc, str, fmt, c - fmt); + c++; + + char buf[32]; + int len; + +#define LOAD(var) \ + do { \ + memcpy(&(var), ptr, sizeof(var)); \ + ptr += sizeof(var); \ + } while (0) + + switch (c[0]) { + case '%': + pl_str_append_raw(alloc, str, c, 1); + continue; + case 's': { + len = strlen((const char *) ptr); + pl_str_append_raw(alloc, str, ptr, len); + ptr += len + 1; // also skip \0 + continue; + } + case '.': { + assert(c[1] == '*'); + assert(c[2] == 's'); + LOAD(len); + pl_str_append_raw(alloc, str, ptr, len); + ptr += len; // no trailing \0 + c += 2; + continue; + } + case 'c': + LOAD(buf[0]); + len = 1; + break; + case 'd': ; + int d; + LOAD(d); + len = pl_str_print_int(buf, sizeof(buf), d); + break; + case 'h': ; + assert(c[1] == 'x'); + unsigned short hx; + LOAD(hx); + len = pl_str_print_hex(buf, sizeof(buf), hx); + c++; + break; + case 'u': ; + unsigned u; + LOAD(u); + len = pl_str_print_uint(buf, sizeof(buf), u); + break; + case 'l': + assert(c[1] == 'l'); + switch (c[2]) { + case 'u': ; + long long unsigned llu; + LOAD(llu); + len = pl_str_print_uint64(buf, sizeof(buf), llu); + break; + case 'd': ; + long long int lld; + LOAD(lld); + len = pl_str_print_int64(buf, sizeof(buf), lld); + break; + default: pl_unreachable(); + } + c += 2; + break; + case 'z': ; + assert(c[1] == 'u'); + size_t zu; + LOAD(zu); + len = 
pl_str_print_uint64(buf, sizeof(buf), zu); + c++; + break; + case 'f': ; + double f; + LOAD(f); + len = pl_str_print_double(buf, sizeof(buf), f); + break; + default: + fprintf(stderr, "Invalid conversion character: '%c'!\n", c[0]); + abort(); + } + + pl_str_append_raw(alloc, str, buf, len); + } +#undef LOAD + + pl_str_append(alloc, str, pl_str0(fmt)); + return (uintptr_t) ptr - (uintptr_t) args; +} diff --git a/src/gamut_mapping.c b/src/gamut_mapping.c new file mode 100644 index 0000000..e80d0a7 --- /dev/null +++ b/src/gamut_mapping.c @@ -0,0 +1,1008 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <math.h> + +#include "common.h" +#include "pl_thread.h" + +#include <libplacebo/gamut_mapping.h> + +#define fclampf(x, lo, hi) fminf(fmaxf(x, lo), hi) +static void fix_constants(struct pl_gamut_map_constants *c) +{ + c->perceptual_deadzone = fclampf(c->perceptual_deadzone, 0.0f, 1.0f); + c->perceptual_strength = fclampf(c->perceptual_strength, 0.0f, 1.0f); + c->colorimetric_gamma = fclampf(c->colorimetric_gamma, 0.0f, 10.0f); + c->softclip_knee = fclampf(c->softclip_knee, 0.0f, 1.0f); + c->softclip_desat = fclampf(c->softclip_desat, 0.0f, 1.0f); +} + +static inline bool constants_equal(const struct pl_gamut_map_constants *a, + const struct pl_gamut_map_constants *b) +{ + pl_static_assert(sizeof(*a) % sizeof(float) == 0); + return !memcmp(a, b, sizeof(*a)); +} + +bool pl_gamut_map_params_equal(const struct pl_gamut_map_params *a, + const struct pl_gamut_map_params *b) +{ + return a->function == b->function && + a->min_luma == b->min_luma && + a->max_luma == b->max_luma && + a->lut_size_I == b->lut_size_I && + a->lut_size_C == b->lut_size_C && + a->lut_size_h == b->lut_size_h && + a->lut_stride == b->lut_stride && + constants_equal(&a->constants, &b->constants) && + pl_raw_primaries_equal(&a->input_gamut, &b->input_gamut) && + pl_raw_primaries_equal(&a->output_gamut, &b->output_gamut); +} + +#define FUN(params) (params->function ? 
*params->function : pl_gamut_map_clip) + +static void noop(float *lut, const struct pl_gamut_map_params *params); +bool pl_gamut_map_params_noop(const struct pl_gamut_map_params *params) +{ + if (FUN(params).map == &noop) + return true; + + struct pl_raw_primaries src = params->input_gamut, dst = params->output_gamut; + if (!pl_primaries_compatible(&dst, &src)) + return true; + + bool need_map = !pl_primaries_superset(&dst, &src); + need_map |= !pl_cie_xy_equal(&src.white, &dst.white); + if (FUN(params).bidirectional) + need_map |= !pl_raw_primaries_equal(&dst, &src); + + return !need_map; +} + +// For some minimal type safety, and code cleanliness +struct RGB { + float R, G, B; +}; + +struct IPT { + float I, P, T; +}; + +struct ICh { + float I, C, h; +}; + +static inline struct ICh ipt2ich(struct IPT c) +{ + return (struct ICh) { + .I = c.I, + .C = sqrtf(c.P * c.P + c.T * c.T), + .h = atan2f(c.T, c.P), + }; +} + +static inline struct IPT ich2ipt(struct ICh c) +{ + return (struct IPT) { + .I = c.I, + .P = c.C * cosf(c.h), + .T = c.C * sinf(c.h), + }; +} + +static const float PQ_M1 = 2610./4096 * 1./4, + PQ_M2 = 2523./4096 * 128, + PQ_C1 = 3424./4096, + PQ_C2 = 2413./4096 * 32, + PQ_C3 = 2392./4096 * 32; + +enum { PQ_LUT_SIZE = 1024 }; +static const float pq_eotf_lut[1024+1] = { + 0.0000000e+00f, 4.0422718e-09f, 1.3111372e-08f, 2.6236826e-08f, 4.3151495e-08f, 6.3746885e-08f, 8.7982383e-08f, 1.1585362e-07f, + 1.4737819e-07f, 1.8258818e-07f, 2.2152586e-07f, 2.6424098e-07f, 3.1078907e-07f, 3.6123021e-07f, 4.1562821e-07f, 4.7405001e-07f, + 5.3656521e-07f, 6.0324583e-07f, 6.7416568e-07f, 7.4940095e-07f, 8.2902897e-07f, 9.1312924e-07f, 1.0017822e-06f, 1.0950702e-06f, + 1.1930764e-06f, 1.2958861e-06f, 1.4035847e-06f, 1.5162600e-06f, 1.6340000e-06f, 1.7568948e-06f, 1.8850346e-06f, 2.0185119e-06f, + 2.1574192e-06f, 2.3018509e-06f, 2.4519029e-06f, 2.6076704e-06f, 2.7692516e-06f, 2.9367449e-06f, 3.1102509e-06f, 3.2898690e-06f, + 3.4757019e-06f, 3.6678526e-06f, 3.8664261e-06f, 4.0715262e-06f, 4.2832601e-06f, 4.5017354e-06f, 4.7270617e-06f, 4.9593473e-06f, + 5.1987040e-06f, 5.4452441e-06f, 5.6990819e-06f, 5.9603301e-06f, 6.2291055e-06f, 6.5055251e-06f, 6.7897080e-06f, 7.0817717e-06f, + 7.3818379e-06f, 7.6900283e-06f, 8.0064675e-06f, 8.3312774e-06f, 8.6645849e-06f, 9.0065169e-06f, 9.3572031e-06f, 9.7167704e-06f, + 1.0085351e-05f, 1.0463077e-05f, 1.0850082e-05f, 1.1246501e-05f, 1.1652473e-05f, 1.2068130e-05f, 1.2493614e-05f, 1.2929066e-05f, + 1.3374626e-05f, 1.3830439e-05f, 1.4296648e-05f, 1.4773401e-05f, 1.5260848e-05f, 1.5759132e-05f, 1.6268405e-05f, 1.6788821e-05f, + 1.7320534e-05f, 1.7863697e-05f, 1.8418467e-05f, 1.8985004e-05f, 1.9563470e-05f, 2.0154019e-05f, 2.0756818e-05f, 2.1372031e-05f, + 2.1999824e-05f, 2.2640365e-05f, 2.3293824e-05f, 2.3960372e-05f, 2.4640186e-05f, 2.5333431e-05f, 2.6040288e-05f, 2.6760935e-05f, + 2.7495552e-05f, 2.8244319e-05f, 2.9007421e-05f, 2.9785041e-05f, 3.0577373e-05f, 3.1384594e-05f, 3.2206899e-05f, 3.3044481e-05f, + 3.3897533e-05f, 3.4766253e-05f, 3.5650838e-05f, 3.6551487e-05f, 3.7468409e-05f, 3.8401794e-05f, 3.9351855e-05f, 4.0318799e-05f, + 4.1302836e-05f, 4.2304177e-05f, 4.3323036e-05f, 4.4359629e-05f, 4.5414181e-05f, 4.6486897e-05f, 4.7578006e-05f, 4.8687732e-05f, + 4.9816302e-05f, 5.0963944e-05f, 5.2130889e-05f, 5.3317369e-05f, 5.4523628e-05f, 5.5749886e-05f, 5.6996391e-05f, 5.8263384e-05f, + 5.9551111e-05f, 6.0859816e-05f, 6.2189750e-05f, 6.3541162e-05f, 6.4914307e-05f, 6.6309439e-05f, 6.7726819e-05f, 6.9166705e-05f, + 7.0629384e-05f, 7.2115077e-05f, 
7.3624074e-05f, 7.5156646e-05f, 7.6713065e-05f, 7.8293608e-05f, 7.9898553e-05f, 8.1528181e-05f, + 8.3182776e-05f, 8.4862623e-05f, 8.6568012e-05f, 8.8299235e-05f, 9.0056585e-05f, 9.1840360e-05f, 9.3650860e-05f, 9.5488388e-05f, + 9.7353277e-05f, 9.9245779e-05f, 1.0116623e-04f, 1.0311496e-04f, 1.0509226e-04f, 1.0709847e-04f, 1.0913391e-04f, 1.1119889e-04f, + 1.1329376e-04f, 1.1541885e-04f, 1.1757448e-04f, 1.1976100e-04f, 1.2197875e-04f, 1.2422807e-04f, 1.2650931e-04f, 1.2882282e-04f, + 1.3116900e-04f, 1.3354812e-04f, 1.3596059e-04f, 1.3840676e-04f, 1.4088701e-04f, 1.4340170e-04f, 1.4595121e-04f, 1.4853593e-04f, + 1.5115622e-04f, 1.5381247e-04f, 1.5650507e-04f, 1.5923442e-04f, 1.6200090e-04f, 1.6480492e-04f, 1.6764688e-04f, 1.7052718e-04f, + 1.7344629e-04f, 1.7640451e-04f, 1.7940233e-04f, 1.8244015e-04f, 1.8551840e-04f, 1.8863752e-04f, 1.9179792e-04f, 1.9500006e-04f, + 1.9824437e-04f, 2.0153130e-04f, 2.0486129e-04f, 2.0823479e-04f, 2.1165227e-04f, 2.1511419e-04f, 2.1862101e-04f, 2.2217319e-04f, + 2.2577128e-04f, 2.2941563e-04f, 2.3310679e-04f, 2.3684523e-04f, 2.4063146e-04f, 2.4446597e-04f, 2.4834925e-04f, 2.5228182e-04f, + 2.5626417e-04f, 2.6029683e-04f, 2.6438031e-04f, 2.6851514e-04f, 2.7270184e-04f, 2.7694094e-04f, 2.8123299e-04f, 2.8557852e-04f, + 2.8997815e-04f, 2.9443230e-04f, 2.9894159e-04f, 3.0350657e-04f, 3.0812783e-04f, 3.1280593e-04f, 3.1754144e-04f, 3.2233495e-04f, + 3.2718705e-04f, 3.3209833e-04f, 3.3706938e-04f, 3.4210082e-04f, 3.4719324e-04f, 3.5234727e-04f, 3.5756351e-04f, 3.6284261e-04f, + 3.6818526e-04f, 3.7359195e-04f, 3.7906340e-04f, 3.8460024e-04f, 3.9020315e-04f, 3.9587277e-04f, 4.0160977e-04f, 4.0741483e-04f, + 4.1328861e-04f, 4.1923181e-04f, 4.2524511e-04f, 4.3132921e-04f, 4.3748480e-04f, 4.4371260e-04f, 4.5001332e-04f, 4.5638768e-04f, + 4.6283650e-04f, 4.6936032e-04f, 4.7595999e-04f, 4.8263624e-04f, 4.8938982e-04f, 4.9622151e-04f, 5.0313205e-04f, 5.1012223e-04f, + 5.1719283e-04f, 5.2434463e-04f, 5.3157843e-04f, 5.3889502e-04f, 5.4629521e-04f, 5.5377982e-04f, 5.6134968e-04f, 5.6900560e-04f, + 5.7674843e-04f, 5.8457900e-04f, 5.9249818e-04f, 6.0050682e-04f, 6.0860578e-04f, 6.1679595e-04f, 6.2507819e-04f, 6.3345341e-04f, + 6.4192275e-04f, 6.5048661e-04f, 6.5914616e-04f, 6.6790231e-04f, 6.7675600e-04f, 6.8570816e-04f, 6.9475975e-04f, 7.0391171e-04f, + 7.1316500e-04f, 7.2252060e-04f, 7.3197948e-04f, 7.4154264e-04f, 7.5121107e-04f, 7.6098577e-04f, 7.7086777e-04f, 7.8085807e-04f, + 7.9095772e-04f, 8.0116775e-04f, 8.1148922e-04f, 8.2192318e-04f, 8.3247071e-04f, 8.4313287e-04f, 8.5391076e-04f, 8.6480548e-04f, + 8.7581812e-04f, 8.8694982e-04f, 8.9820168e-04f, 9.0957485e-04f, 9.2107048e-04f, 9.3268971e-04f, 9.4443372e-04f, 9.5630368e-04f, + 9.6830115e-04f, 9.8042658e-04f, 9.9268155e-04f, 1.0050673e-03f, 1.0175850e-03f, 1.0302359e-03f, 1.0430213e-03f, 1.0559425e-03f, + 1.0690006e-03f, 1.0821970e-03f, 1.0955331e-03f, 1.1090100e-03f, 1.1226290e-03f, 1.1363917e-03f, 1.1502992e-03f, 1.1643529e-03f, + 1.1785542e-03f, 1.1929044e-03f, 1.2074050e-03f, 1.2220573e-03f, 1.2368628e-03f, 1.2518229e-03f, 1.2669390e-03f, 1.2822125e-03f, + 1.2976449e-03f, 1.3132377e-03f, 1.3289925e-03f, 1.3449105e-03f, 1.3609935e-03f, 1.3772429e-03f, 1.3936602e-03f, 1.4102470e-03f, + 1.4270054e-03f, 1.4439360e-03f, 1.4610407e-03f, 1.4783214e-03f, 1.4957794e-03f, 1.5134166e-03f, 1.5312345e-03f, 1.5492348e-03f, + 1.5674192e-03f, 1.5857894e-03f, 1.6043471e-03f, 1.6230939e-03f, 1.6420317e-03f, 1.6611622e-03f, 1.6804871e-03f, 1.7000083e-03f, + 1.7197275e-03f, 1.7396465e-03f, 1.7597672e-03f, 1.7800914e-03f, 
1.8006210e-03f, 1.8213578e-03f, 1.8423038e-03f, 1.8634608e-03f, + 1.8848308e-03f, 1.9064157e-03f, 1.9282175e-03f, 1.9502381e-03f, 1.9724796e-03f, 1.9949439e-03f, 2.0176331e-03f, 2.0405492e-03f, + 2.0636950e-03f, 2.0870711e-03f, 2.1106805e-03f, 2.1345250e-03f, 2.1586071e-03f, 2.1829286e-03f, 2.2074919e-03f, 2.2322992e-03f, + 2.2573525e-03f, 2.2826542e-03f, 2.3082066e-03f, 2.3340118e-03f, 2.3600721e-03f, 2.3863900e-03f, 2.4129676e-03f, 2.4398074e-03f, + 2.4669117e-03f, 2.4942828e-03f, 2.5219233e-03f, 2.5498355e-03f, 2.5780219e-03f, 2.6064849e-03f, 2.6352271e-03f, 2.6642509e-03f, + 2.6935589e-03f, 2.7231536e-03f, 2.7530377e-03f, 2.7832137e-03f, 2.8136843e-03f, 2.8444520e-03f, 2.8755196e-03f, 2.9068898e-03f, + 2.9385662e-03f, 2.9705496e-03f, 3.0028439e-03f, 3.0354517e-03f, 3.0683758e-03f, 3.1016192e-03f, 3.1351846e-03f, 3.1690750e-03f, + 3.2032932e-03f, 3.2378422e-03f, 3.2727250e-03f, 3.3079445e-03f, 3.3435038e-03f, 3.3794058e-03f, 3.4156537e-03f, 3.4522505e-03f, + 3.4891993e-03f, 3.5265034e-03f, 3.5641658e-03f, 3.6021897e-03f, 3.6405785e-03f, 3.6793353e-03f, 3.7184634e-03f, 3.7579661e-03f, + 3.7978468e-03f, 3.8381088e-03f, 3.8787555e-03f, 3.9197904e-03f, 3.9612169e-03f, 4.0030385e-03f, 4.0452587e-03f, 4.0878810e-03f, + 4.1309104e-03f, 4.1743478e-03f, 4.2181981e-03f, 4.2624651e-03f, 4.3071525e-03f, 4.3522639e-03f, 4.3978031e-03f, 4.4437739e-03f, + 4.4901803e-03f, 4.5370259e-03f, 4.5843148e-03f, 4.6320508e-03f, 4.6802379e-03f, 4.7288801e-03f, 4.7779815e-03f, 4.8275461e-03f, + 4.8775780e-03f, 4.9280813e-03f, 4.9790603e-03f, 5.0305191e-03f, 5.0824620e-03f, 5.1348933e-03f, 5.1878172e-03f, 5.2412382e-03f, + 5.2951607e-03f, 5.3495890e-03f, 5.4045276e-03f, 5.4599811e-03f, 5.5159540e-03f, 5.5724510e-03f, 5.6294765e-03f, 5.6870353e-03f, + 5.7451339e-03f, 5.8037735e-03f, 5.8629606e-03f, 5.9227001e-03f, 5.9829968e-03f, 6.0438557e-03f, 6.1052818e-03f, 6.1672799e-03f, + 6.2298552e-03f, 6.2930128e-03f, 6.3567578e-03f, 6.4210953e-03f, 6.4860306e-03f, 6.5515690e-03f, 6.6177157e-03f, 6.6844762e-03f, + 6.7518558e-03f, 6.8198599e-03f, 6.8884942e-03f, 6.9577641e-03f, 7.0276752e-03f, 7.0982332e-03f, 7.1694438e-03f, 7.2413127e-03f, + 7.3138457e-03f, 7.3870486e-03f, 7.4609273e-03f, 7.5354878e-03f, 7.6107361e-03f, 7.6866782e-03f, 7.7633203e-03f, 7.8406684e-03f, + 7.9187312e-03f, 7.9975101e-03f, 8.0770139e-03f, 8.1572490e-03f, 8.2382216e-03f, 8.3199385e-03f, 8.4024059e-03f, 8.4856307e-03f, + 8.5696193e-03f, 8.6543786e-03f, 8.7399153e-03f, 8.8262362e-03f, 8.9133482e-03f, 9.0012582e-03f, 9.0899733e-03f, 9.1795005e-03f, + 9.2698470e-03f, 9.3610199e-03f, 9.4530265e-03f, 9.5458741e-03f, 9.6395701e-03f, 9.7341219e-03f, 9.8295370e-03f, 9.9258231e-03f, + 1.0022988e-02f, 1.0121039e-02f, 1.0219984e-02f, 1.0319830e-02f, 1.0420587e-02f, 1.0522261e-02f, 1.0624862e-02f, 1.0728396e-02f, + 1.0832872e-02f, 1.0938299e-02f, 1.1044684e-02f, 1.1152036e-02f, 1.1260365e-02f, 1.1369677e-02f, 1.1479982e-02f, 1.1591288e-02f, + 1.1703605e-02f, 1.1816941e-02f, 1.1931305e-02f, 1.2046706e-02f, 1.2163153e-02f, 1.2280656e-02f, 1.2399223e-02f, 1.2518864e-02f, + 1.2639596e-02f, 1.2761413e-02f, 1.2884333e-02f, 1.3008365e-02f, 1.3133519e-02f, 1.3259804e-02f, 1.3387231e-02f, 1.3515809e-02f, + 1.3645549e-02f, 1.3776461e-02f, 1.3908555e-02f, 1.4041841e-02f, 1.4176331e-02f, 1.4312034e-02f, 1.4448961e-02f, 1.4587123e-02f, + 1.4726530e-02f, 1.4867194e-02f, 1.5009126e-02f, 1.5152336e-02f, 1.5296837e-02f, 1.5442638e-02f, 1.5589753e-02f, 1.5738191e-02f, + 1.5887965e-02f, 1.6039087e-02f, 1.6191567e-02f, 1.6345419e-02f, 1.6500655e-02f, 1.6657285e-02f, 
1.6815323e-02f, 1.6974781e-02f, + 1.7135672e-02f, 1.7298007e-02f, 1.7461800e-02f, 1.7627063e-02f, 1.7793810e-02f, 1.7962053e-02f, 1.8131805e-02f, 1.8303080e-02f, + 1.8475891e-02f, 1.8650252e-02f, 1.8826176e-02f, 1.9003676e-02f, 1.9182767e-02f, 1.9363463e-02f, 1.9545777e-02f, 1.9729724e-02f, + 1.9915319e-02f, 2.0102575e-02f, 2.0291507e-02f, 2.0482131e-02f, 2.0674460e-02f, 2.0868510e-02f, 2.1064296e-02f, 2.1261833e-02f, + 2.1461136e-02f, 2.1662222e-02f, 2.1865105e-02f, 2.2069802e-02f, 2.2276328e-02f, 2.2484700e-02f, 2.2694934e-02f, 2.2907045e-02f, + 2.3121064e-02f, 2.3336982e-02f, 2.3554827e-02f, 2.3774618e-02f, 2.3996370e-02f, 2.4220102e-02f, 2.4445831e-02f, 2.4673574e-02f, + 2.4903349e-02f, 2.5135174e-02f, 2.5369067e-02f, 2.5605046e-02f, 2.5843129e-02f, 2.6083336e-02f, 2.6325684e-02f, 2.6570192e-02f, + 2.6816880e-02f, 2.7065767e-02f, 2.7316872e-02f, 2.7570215e-02f, 2.7825815e-02f, 2.8083692e-02f, 2.8343867e-02f, 2.8606359e-02f, + 2.8871189e-02f, 2.9138378e-02f, 2.9407946e-02f, 2.9679914e-02f, 2.9954304e-02f, 3.0231137e-02f, 3.0510434e-02f, 3.0792217e-02f, + 3.1076508e-02f, 3.1363330e-02f, 3.1652704e-02f, 3.1944653e-02f, 3.2239199e-02f, 3.2536367e-02f, 3.2836178e-02f, 3.3138657e-02f, + 3.3443826e-02f, 3.3751710e-02f, 3.4062333e-02f, 3.4375718e-02f, 3.4691890e-02f, 3.5010874e-02f, 3.5332694e-02f, 3.5657377e-02f, + 3.5984946e-02f, 3.6315428e-02f, 3.6648848e-02f, 3.6985233e-02f, 3.7324608e-02f, 3.7667000e-02f, 3.8012436e-02f, 3.8360942e-02f, + 3.8712547e-02f, 3.9067276e-02f, 3.9425159e-02f, 3.9786223e-02f, 4.0150496e-02f, 4.0518006e-02f, 4.0888783e-02f, 4.1262855e-02f, + 4.1640274e-02f, 4.2021025e-02f, 4.2405159e-02f, 4.2792707e-02f, 4.3183699e-02f, 4.3578166e-02f, 4.3976138e-02f, 4.4377647e-02f, + 4.4782724e-02f, 4.5191401e-02f, 4.5603709e-02f, 4.6019681e-02f, 4.6439350e-02f, 4.6862749e-02f, 4.7289910e-02f, 4.7720867e-02f, + 4.8155654e-02f, 4.8594305e-02f, 4.9036854e-02f, 4.9483336e-02f, 4.9933787e-02f, 5.0388240e-02f, 5.0846733e-02f, 5.1309301e-02f, + 5.1775981e-02f, 5.2246808e-02f, 5.2721821e-02f, 5.3201056e-02f, 5.3684551e-02f, 5.4172344e-02f, 5.4664473e-02f, 5.5160978e-02f, + 5.5661897e-02f, 5.6167269e-02f, 5.6677135e-02f, 5.7191535e-02f, 5.7710508e-02f, 5.8234097e-02f, 5.8762342e-02f, 5.9295285e-02f, + 5.9832968e-02f, 6.0375433e-02f, 6.0922723e-02f, 6.1474882e-02f, 6.2031952e-02f, 6.2593979e-02f, 6.3161006e-02f, 6.3733078e-02f, + 6.4310241e-02f, 6.4892540e-02f, 6.5480021e-02f, 6.6072730e-02f, 6.6670715e-02f, 6.7274023e-02f, 6.7882702e-02f, 6.8496800e-02f, + 6.9116365e-02f, 6.9741447e-02f, 7.0372096e-02f, 7.1008361e-02f, 7.1650293e-02f, 7.2297942e-02f, 7.2951361e-02f, 7.3610602e-02f, + 7.4275756e-02f, 7.4946797e-02f, 7.5623818e-02f, 7.6306873e-02f, 7.6996016e-02f, 7.7691302e-02f, 7.8392787e-02f, 7.9100526e-02f, + 7.9814576e-02f, 8.0534993e-02f, 8.1261837e-02f, 8.1995163e-02f, 8.2735032e-02f, 8.3481501e-02f, 8.4234632e-02f, 8.4994483e-02f, + 8.5761116e-02f, 8.6534592e-02f, 8.7314974e-02f, 8.8102323e-02f, 8.8896702e-02f, 8.9698176e-02f, 9.0506809e-02f, 9.1322665e-02f, + 9.2145810e-02f, 9.2976310e-02f, 9.3814232e-02f, 9.4659643e-02f, 9.5512612e-02f, 9.6373206e-02f, 9.7241496e-02f, 9.8117550e-02f, + 9.9001441e-02f, 9.9893238e-02f, 1.0079301e-01f, 1.0170084e-01f, 1.0261679e-01f, 1.0354094e-01f, 1.0447337e-01f, 1.0541414e-01f, + 1.0636334e-01f, 1.0732104e-01f, 1.0828731e-01f, 1.0926225e-01f, 1.1024592e-01f, 1.1123841e-01f, 1.1223979e-01f, 1.1325016e-01f, + 1.1426958e-01f, 1.1529814e-01f, 1.1633594e-01f, 1.1738304e-01f, 1.1843954e-01f, 1.1950552e-01f, 1.2058107e-01f, 1.2166627e-01f, + 
1.2276122e-01f, 1.2386601e-01f, 1.2498072e-01f, 1.2610544e-01f, 1.2724027e-01f, 1.2838531e-01f, 1.2954063e-01f, 1.3070635e-01f, + 1.3188262e-01f, 1.3306940e-01f, 1.3426686e-01f, 1.3547509e-01f, 1.3669420e-01f, 1.3792428e-01f, 1.3916544e-01f, 1.4041778e-01f, + 1.4168140e-01f, 1.4295640e-01f, 1.4424289e-01f, 1.4554098e-01f, 1.4685078e-01f, 1.4817238e-01f, 1.4950591e-01f, 1.5085147e-01f, + 1.5220916e-01f, 1.5357912e-01f, 1.5496144e-01f, 1.5635624e-01f, 1.5776364e-01f, 1.5918375e-01f, 1.6061670e-01f, 1.6206260e-01f, + 1.6352156e-01f, 1.6499372e-01f, 1.6647920e-01f, 1.6797811e-01f, 1.6949059e-01f, 1.7101676e-01f, 1.7255674e-01f, 1.7411067e-01f, + 1.7567867e-01f, 1.7726087e-01f, 1.7885742e-01f, 1.8046844e-01f, 1.8209406e-01f, 1.8373443e-01f, 1.8538967e-01f, 1.8705994e-01f, + 1.8874536e-01f, 1.9044608e-01f, 1.9216225e-01f, 1.9389401e-01f, 1.9564150e-01f, 1.9740486e-01f, 1.9918426e-01f, 2.0097984e-01f, + 2.0279175e-01f, 2.0462014e-01f, 2.0646517e-01f, 2.0832699e-01f, 2.1020577e-01f, 2.1210165e-01f, 2.1401481e-01f, 2.1594540e-01f, + 2.1789359e-01f, 2.1985954e-01f, 2.2184342e-01f, 2.2384540e-01f, 2.2586565e-01f, 2.2790434e-01f, 2.2996165e-01f, 2.3203774e-01f, + 2.3413293e-01f, 2.3624714e-01f, 2.3838068e-01f, 2.4053372e-01f, 2.4270646e-01f, 2.4489908e-01f, 2.4711177e-01f, 2.4934471e-01f, + 2.5159811e-01f, 2.5387214e-01f, 2.5616702e-01f, 2.5848293e-01f, 2.6082007e-01f, 2.6317866e-01f, 2.6555888e-01f, 2.6796095e-01f, + 2.7038507e-01f, 2.7283145e-01f, 2.7530031e-01f, 2.7779186e-01f, 2.8030631e-01f, 2.8284388e-01f, 2.8540479e-01f, 2.8798927e-01f, + 2.9059754e-01f, 2.9322983e-01f, 2.9588635e-01f, 2.9856736e-01f, 3.0127308e-01f, 3.0400374e-01f, 3.0675959e-01f, 3.0954086e-01f, + 3.1234780e-01f, 3.1518066e-01f, 3.1803969e-01f, 3.2092512e-01f, 3.2383723e-01f, 3.2677625e-01f, 3.2974246e-01f, 3.3273611e-01f, + 3.3575747e-01f, 3.3880680e-01f, 3.4188437e-01f, 3.4499045e-01f, 3.4812533e-01f, 3.5128926e-01f, 3.5448255e-01f, 3.5770546e-01f, + 3.6095828e-01f, 3.6424131e-01f, 3.6755483e-01f, 3.7089914e-01f, 3.7427454e-01f, 3.7768132e-01f, 3.8111979e-01f, 3.8459027e-01f, + 3.8809304e-01f, 3.9162844e-01f, 3.9519678e-01f, 3.9879837e-01f, 4.0243354e-01f, 4.0610261e-01f, 4.0980592e-01f, 4.1354380e-01f, + 4.1731681e-01f, 4.2112483e-01f, 4.2496844e-01f, 4.2884798e-01f, 4.3276381e-01f, 4.3671627e-01f, 4.4070572e-01f, 4.4473253e-01f, + 4.4879706e-01f, 4.5289968e-01f, 4.5704076e-01f, 4.6122068e-01f, 4.6543981e-01f, 4.6969854e-01f, 4.7399727e-01f, 4.7833637e-01f, + 4.8271625e-01f, 4.8713731e-01f, 4.9159995e-01f, 4.9610458e-01f, 5.0065162e-01f, 5.0524147e-01f, 5.0987457e-01f, 5.1455133e-01f, + 5.1927219e-01f, 5.2403759e-01f, 5.2884795e-01f, 5.3370373e-01f, 5.3860537e-01f, 5.4355333e-01f, 5.4854807e-01f, 5.5359004e-01f, + 5.5867972e-01f, 5.6381757e-01f, 5.6900408e-01f, 5.7423972e-01f, 5.7952499e-01f, 5.8486037e-01f, 5.9024637e-01f, 5.9568349e-01f, + 6.0117223e-01f, 6.0671311e-01f, 6.1230664e-01f, 6.1795336e-01f, 6.2365379e-01f, 6.2940847e-01f, 6.3521793e-01f, 6.4108273e-01f, + 6.4700342e-01f, 6.5298056e-01f, 6.5901471e-01f, 6.6510643e-01f, 6.7125632e-01f, 6.7746495e-01f, 6.8373290e-01f, 6.9006078e-01f, + 6.9644918e-01f, 7.0289872e-01f, 7.0941001e-01f, 7.1598366e-01f, 7.2262031e-01f, 7.2932059e-01f, 7.3608513e-01f, 7.4291460e-01f, + 7.4981006e-01f, 7.5677134e-01f, 7.6379952e-01f, 7.7089527e-01f, 7.7805929e-01f, 7.8529226e-01f, 7.9259489e-01f, 7.9996786e-01f, + 8.0741191e-01f, 8.1492774e-01f, 8.2251609e-01f, 8.3017769e-01f, 8.3791329e-01f, 8.4572364e-01f, 8.5360950e-01f, 8.6157163e-01f, + 8.6961082e-01f, 8.7772786e-01f, 
8.8592352e-01f, 8.9419862e-01f, 9.0255397e-01f, 9.1099038e-01f, 9.1950869e-01f, 9.2810973e-01f, + 9.3679435e-01f, 9.4556340e-01f, 9.5441776e-01f, 9.6335829e-01f, 9.7238588e-01f, 9.8150143e-01f, 9.9070583e-01f, 1.0000000e+00f, + 1.0f, // extra padding to avoid out of bounds access +}; + +static inline float pq_eotf(float x) +{ + float idxf = fminf(fmaxf(x, 0.0f), 1.0f) * (PQ_LUT_SIZE - 1); + int ipart = floorf(idxf); + float fpart = idxf - ipart; + return PL_MIX(pq_eotf_lut[ipart], pq_eotf_lut[ipart + 1], fpart); +} + +static inline float pq_oetf(float x) +{ + x = powf(fmaxf(x, 0.0f), PQ_M1); + x = (PQ_C1 + PQ_C2 * x) / (1.0f + PQ_C3 * x); + return powf(x, PQ_M2); +} + +// Helper struct containing pre-computed cached values describing a gamut +struct gamut { + pl_matrix3x3 lms2rgb; + pl_matrix3x3 rgb2lms; + float min_luma, max_luma; // pq + float min_rgb, max_rgb; // 10k normalized + struct ICh *peak_cache; // 1-item cache for computed peaks (per hue) +}; + +struct cache { + struct ICh src_cache; + struct ICh dst_cache; +}; + +static void get_gamuts(struct gamut *dst, struct gamut *src, struct cache *cache, + const struct pl_gamut_map_params *params) +{ + const float epsilon = 1e-6; + memset(cache, 0, sizeof(*cache)); + struct gamut base = { + .min_luma = params->min_luma, + .max_luma = params->max_luma, + .min_rgb = pq_eotf(params->min_luma) - epsilon, + .max_rgb = pq_eotf(params->max_luma) + epsilon, + }; + + if (dst) { + *dst = base; + dst->lms2rgb = dst->rgb2lms = pl_ipt_rgb2lms(¶ms->output_gamut); + dst->peak_cache = &cache->dst_cache; + pl_matrix3x3_invert(&dst->lms2rgb); + } + + if (src) { + *src = base; + src->lms2rgb = src->rgb2lms = pl_ipt_rgb2lms(¶ms->input_gamut); + src->peak_cache = &cache->src_cache; + pl_matrix3x3_invert(&src->lms2rgb); + } +} + +static inline struct IPT rgb2ipt(struct RGB c, struct gamut gamut) +{ + const float L = gamut.rgb2lms.m[0][0] * c.R + + gamut.rgb2lms.m[0][1] * c.G + + gamut.rgb2lms.m[0][2] * c.B; + const float M = gamut.rgb2lms.m[1][0] * c.R + + gamut.rgb2lms.m[1][1] * c.G + + gamut.rgb2lms.m[1][2] * c.B; + const float S = gamut.rgb2lms.m[2][0] * c.R + + gamut.rgb2lms.m[2][1] * c.G + + gamut.rgb2lms.m[2][2] * c.B; + const float Lp = pq_oetf(L); + const float Mp = pq_oetf(M); + const float Sp = pq_oetf(S); + return (struct IPT) { + .I = 0.4000f * Lp + 0.4000f * Mp + 0.2000f * Sp, + .P = 4.4550f * Lp - 4.8510f * Mp + 0.3960f * Sp, + .T = 0.8056f * Lp + 0.3572f * Mp - 1.1628f * Sp, + }; +} + +static inline struct RGB ipt2rgb(struct IPT c, struct gamut gamut) +{ + const float Lp = c.I + 0.0975689f * c.P + 0.205226f * c.T; + const float Mp = c.I - 0.1138760f * c.P + 0.133217f * c.T; + const float Sp = c.I + 0.0326151f * c.P - 0.676887f * c.T; + const float L = pq_eotf(Lp); + const float M = pq_eotf(Mp); + const float S = pq_eotf(Sp); + return (struct RGB) { + .R = gamut.lms2rgb.m[0][0] * L + + gamut.lms2rgb.m[0][1] * M + + gamut.lms2rgb.m[0][2] * S, + .G = gamut.lms2rgb.m[1][0] * L + + gamut.lms2rgb.m[1][1] * M + + gamut.lms2rgb.m[1][2] * S, + .B = gamut.lms2rgb.m[2][0] * L + + gamut.lms2rgb.m[2][1] * M + + gamut.lms2rgb.m[2][2] * S, + }; +} + +static inline bool ingamut(struct IPT c, struct gamut gamut) +{ + const float Lp = c.I + 0.0975689f * c.P + 0.205226f * c.T; + const float Mp = c.I - 0.1138760f * c.P + 0.133217f * c.T; + const float Sp = c.I + 0.0326151f * c.P - 0.676887f * c.T; + if (Lp < gamut.min_luma || Lp > gamut.max_luma || + Mp < gamut.min_luma || Mp > gamut.max_luma || + Sp < gamut.min_luma || Sp > gamut.max_luma) + { + // Early exit for 
values outside legal LMS range + return false; + } + + const float L = pq_eotf(Lp); + const float M = pq_eotf(Mp); + const float S = pq_eotf(Sp); + struct RGB rgb = { + .R = gamut.lms2rgb.m[0][0] * L + + gamut.lms2rgb.m[0][1] * M + + gamut.lms2rgb.m[0][2] * S, + .G = gamut.lms2rgb.m[1][0] * L + + gamut.lms2rgb.m[1][1] * M + + gamut.lms2rgb.m[1][2] * S, + .B = gamut.lms2rgb.m[2][0] * L + + gamut.lms2rgb.m[2][1] * M + + gamut.lms2rgb.m[2][2] * S, + }; + return rgb.R >= gamut.min_rgb && rgb.R <= gamut.max_rgb && + rgb.G >= gamut.min_rgb && rgb.G <= gamut.max_rgb && + rgb.B >= gamut.min_rgb && rgb.B <= gamut.max_rgb; +} + +struct generate_args { + const struct pl_gamut_map_params *params; + float *out; + int start; + int count; +}; + +static PL_THREAD_VOID generate(void *priv) +{ + const struct generate_args *args = priv; + const struct pl_gamut_map_params *params = args->params; + + float *in = args->out; + const int end = args->start + args->count; + for (int h = args->start; h < end; h++) { + for (int C = 0; C < params->lut_size_C; C++) { + for (int I = 0; I < params->lut_size_I; I++) { + float Ix = (float) I / (params->lut_size_I - 1); + float Cx = (float) C / (params->lut_size_C - 1); + float hx = (float) h / (params->lut_size_h - 1); + struct IPT ipt = ich2ipt((struct ICh) { + .I = PL_MIX(params->min_luma, params->max_luma, Ix), + .C = PL_MIX(0.0f, 0.5f, Cx), + .h = PL_MIX(-M_PI, M_PI, hx), + }); + in[0] = ipt.I; + in[1] = ipt.P; + in[2] = ipt.T; + in += params->lut_stride; + } + } + } + + struct pl_gamut_map_params fixed = *params; + fix_constants(&fixed.constants); + fixed.lut_size_h = args->count; + FUN(params).map(args->out, &fixed); + PL_THREAD_RETURN(); +} + +void pl_gamut_map_generate(float *out, const struct pl_gamut_map_params *params) +{ + enum { MAX_WORKERS = 32 }; + struct generate_args args[MAX_WORKERS]; + + const int num_per_worker = PL_DIV_UP(params->lut_size_h, MAX_WORKERS); + const int num_workers = PL_DIV_UP(params->lut_size_h, num_per_worker); + for (int i = 0; i < num_workers; i++) { + const int start = i * num_per_worker; + const int count = PL_MIN(num_per_worker, params->lut_size_h - start); + args[i] = (struct generate_args) { + .params = params, + .out = out, + .start = start, + .count = count, + }; + out += count * params->lut_size_C * params->lut_size_I * params->lut_stride; + } + + pl_thread workers[MAX_WORKERS] = {0}; + for (int i = 0; i < num_workers; i++) { + if (pl_thread_create(&workers[i], generate, &args[i]) != 0) + generate(&args[i]); // fallback + } + + for (int i = 0; i < num_workers; i++) { + if (!workers[i]) + continue; + if (pl_thread_join(workers[i]) != 0) + generate(&args[i]); // fallback + } +} + +void pl_gamut_map_sample(float x[3], const struct pl_gamut_map_params *params) +{ + struct pl_gamut_map_params fixed = *params; + fix_constants(&fixed.constants); + fixed.lut_size_I = fixed.lut_size_C = fixed.lut_size_h = 1; + fixed.lut_stride = 3; + + FUN(params).map(x, &fixed); +} + +#define LUT_SIZE(p) (p->lut_size_I * p->lut_size_C * p->lut_size_h * p->lut_stride) +#define FOREACH_LUT(lut, C) \ + for (struct IPT *_i = (struct IPT *) lut, \ + *_end = (struct IPT *) (lut + LUT_SIZE(params)), \ + C; \ + _i < _end && ( C = *_i, 1 ); \ + *_i = C, _i = (struct IPT *) ((float *) _i + params->lut_stride)) + +// Something like PL_MIX(base, c, x) but follows an exponential curve, note +// that this can be used to extend 'c' outwards for x > 1 +static inline struct ICh mix_exp(struct ICh c, float x, float gamma, float base) +{ + return (struct ICh) { + .I = 
base + (c.I - base) * powf(x, gamma), + .C = c.C * x, + .h = c.h, + }; +} + +// Drop gamma for colors approaching black and achromatic to avoid numerical +// instabilities, and excessive brightness boosting of grain, while also +// strongly boosting gamma for values exceeding the target peak +static inline float scale_gamma(float gamma, struct ICh ich, struct ICh peak, + struct gamut gamut) +{ + const float Imin = gamut.min_luma; + const float Irel = fmaxf((ich.I - Imin) / (peak.I - Imin), 0.0f); + return gamma * powf(Irel, 3) * fminf(ich.C / peak.C, 1.0f); +} + +static const float maxDelta = 5e-5f; + +// Find gamut intersection using specified bounds +static inline struct ICh +desat_bounded(float I, float h, float Cmin, float Cmax, struct gamut gamut) +{ + if (I <= gamut.min_luma) + return (struct ICh) { .I = gamut.min_luma, .C = 0, .h = h }; + if (I >= gamut.max_luma) + return (struct ICh) { .I = gamut.max_luma, .C = 0, .h = h }; + + const float maxDI = I * maxDelta; + struct ICh res = { .I = I, .C = (Cmin + Cmax) / 2, .h = h }; + do { + if (ingamut(ich2ipt(res), gamut)) { + Cmin = res.C; + } else { + Cmax = res.C; + } + res.C = (Cmin + Cmax) / 2; + } while (Cmax - Cmin > maxDI); + + return res; +} + +// Finds maximally saturated in-gamut color (for given hue) +static inline struct ICh saturate(float hue, struct gamut gamut) +{ + if (gamut.peak_cache->I && fabsf(gamut.peak_cache->h - hue) < 1e-3) + return *gamut.peak_cache; + + static const float invphi = 0.6180339887498948f; + static const float invphi2 = 0.38196601125010515f; + + struct ICh lo = { .I = gamut.min_luma, .h = hue }; + struct ICh hi = { .I = gamut.max_luma, .h = hue }; + float de = hi.I - lo.I; + struct ICh a = { .I = lo.I + invphi2 * de }; + struct ICh b = { .I = lo.I + invphi * de }; + a = desat_bounded(a.I, hue, 0.0f, 0.5f, gamut); + b = desat_bounded(b.I, hue, 0.0f, 0.5f, gamut); + + while (de > maxDelta) { + de *= invphi; + if (a.C > b.C) { + hi = b; + b = a; + a.I = lo.I + invphi2 * de; + a = desat_bounded(a.I, hue, lo.C - maxDelta, 0.5f, gamut); + } else { + lo = a; + a = b; + b.I = lo.I + invphi * de; + b = desat_bounded(b.I, hue, hi.C - maxDelta, 0.5f, gamut); + } + } + + struct ICh peak = a.C > b.C ? 
a : b; + *gamut.peak_cache = peak; + return peak; +} + +// Clip a color along the exponential curve given by `gamma` +static inline struct IPT +clip_gamma(struct IPT ipt, float gamma, struct gamut gamut) +{ + if (ipt.I <= gamut.min_luma) + return (struct IPT) { .I = gamut.min_luma }; + if (ingamut(ipt, gamut)) + return ipt; + + struct ICh ich = ipt2ich(ipt); + if (!gamma) + return ich2ipt(desat_bounded(ich.I, ich.h, 0.0f, ich.C, gamut)); + + const float maxDI = fmaxf(ich.I * maxDelta, 1e-7f); + struct ICh peak = saturate(ich.h, gamut); + gamma = scale_gamma(gamma, ich, peak, gamut); + float lo = 0.0f, hi = 1.0f, x = 0.5f; + do { + struct ICh test = mix_exp(ich, x, gamma, peak.I); + if (ingamut(ich2ipt(test), gamut)) { + lo = x; + } else { + hi = x; + } + x = (lo + hi) / 2.0f; + } while (hi - lo > maxDI); + + return ich2ipt(mix_exp(ich, x, gamma, peak.I)); +} + +static float softclip(float value, float source, float target, + const struct pl_gamut_map_constants *c) +{ + if (!target) + return 0.0f; + const float peak = source / target; + const float x = fminf(value / target, peak); + const float j = c->softclip_knee; + if (x <= j || peak <= 1.0) + return value; + // Apply simple mobius function + const float a = -j*j * (peak - 1.0f) / (j*j - 2.0f * j + peak); + const float b = (j*j - 2.0f * j * peak + peak) / + fmaxf(1e-6f, peak - 1.0f); + const float scale = (b*b + 2.0f * b*j + j*j) / (b - a); + return scale * (x + a) / (x + b) * target; +} + +static int cmp_float(const void *a, const void *b) +{ + float fa = *(const float*) a; + float fb = *(const float*) b; + return PL_CMP(fa, fb); +} + +static float wrap(float h) +{ + if (h > M_PI) { + return h - 2 * M_PI; + } else if (h < -M_PI) { + return h + 2 * M_PI; + } else { + return h; + } +} + +enum { + S = 12, // number of hue shift vertices + N = S + 2, // +2 for the endpoints +}; + +// Hue-shift helper struct +struct hueshift { + float dh[N]; + float dddh[N]; + float K[N]; + float prev_hue; + float prev_shift; + struct { float hue, delta; } hueshift[N]; +}; + +static void hueshift_prepare(struct hueshift *s, struct gamut src, struct gamut dst) +{ + const float O = pq_eotf(src.min_luma), X = pq_eotf(src.max_luma); + const float M = (O + X) / 2.0f; + const struct RGB refpoints[S] = { + {X, O, O}, {O, X, O}, {O, O, X}, + {O, X, X}, {X, O, X}, {X, X, O}, + {O, X, M}, {X, O, M}, {X, M, O}, + {O, M, X}, {M, O, X}, {M, X, O}, + }; + + memset(s, 0, sizeof(*s)); + for (int i = 0; i < S; i++) { + struct ICh ich_src = ipt2ich(rgb2ipt(refpoints[i], src)); + struct ICh ich_dst = ipt2ich(rgb2ipt(refpoints[i], dst)); + const float delta = wrap(ich_dst.h - ich_src.h); + s->hueshift[i+1].hue = ich_src.h; + s->hueshift[i+1].delta = delta; + } + + // Sort and wrap endpoints + qsort(s->hueshift + 1, S, sizeof(*s->hueshift), cmp_float); + s->hueshift[0] = s->hueshift[S]; + s->hueshift[S+1] = s->hueshift[1]; + s->hueshift[0].hue -= 2 * M_PI; + s->hueshift[S+1].hue += 2 * M_PI; + + // Construction of cubic spline coefficients + float tmp[N][N] = {0}; + for (int i = N - 1; i > 0; i--) { + s->dh[i-1] = s->hueshift[i].hue - s->hueshift[i-1].hue; + s->dddh[i] = (s->hueshift[i].delta - s->hueshift[i-1].delta) / s->dh[i-1]; + } + for (int i = 1; i < N - 1; i++) { + tmp[i][i] = 2 * (s->dh[i-1] + s->dh[i]); + if (i != 1) + tmp[i][i-1] = tmp[i-1][i] = s->dh[i-1]; + tmp[i][N-1] = 6 * (s->dddh[i+1] - s->dddh[i]); + } + for (int i = 1; i < N - 2; i++) { + const float q = (tmp[i+1][i] / tmp[i][i]); + for (int j = 1; j <= N - 1; j++) + tmp[i+1][j] -= q * tmp[i][j]; + } + for 
(int i = N - 2; i > 0; i--) { + float sum = 0.0f; + for (int j = i; j <= N - 2; j++) + sum += tmp[i][j] * s->K[j]; + s->K[i] = (tmp[i][N-1] - sum) / tmp[i][i]; + } + + s->prev_hue = -10.0f; +} + +static struct ICh hueshift_apply(struct hueshift *s, struct ICh ich) +{ + if (fabsf(ich.h - s->prev_hue) < 1e-6f) + goto done; + + // Determine perceptual hue shift delta by interpolation of refpoints + for (int i = 0; i < N - 1; i++) { + if (s->hueshift[i+1].hue > ich.h) { + pl_assert(s->hueshift[i].hue <= ich.h); + float a = (s->K[i+1] - s->K[i]) / (6 * s->dh[i]); + float b = s->K[i] / 2; + float c = s->dddh[i+1] - (2 * s->dh[i] * s->K[i] + s->K[i+1] * s->dh[i]) / 6; + float d = s->hueshift[i].delta; + float x = ich.h - s->hueshift[i].hue; + float delta = ((a * x + b) * x + c) * x + d; + s->prev_shift = ich.h + delta; + s->prev_hue = ich.h; + break; + } + } + +done: + return (struct ICh) { + .I = ich.I, + .C = ich.C, + .h = s->prev_shift, + }; +} + +static void perceptual(float *lut, const struct pl_gamut_map_params *params) +{ + const struct pl_gamut_map_constants *c = ¶ms->constants; + struct cache cache; + struct gamut dst, src; + get_gamuts(&dst, &src, &cache, params); + + FOREACH_LUT(lut, ipt) { + struct ICh ich = ipt2ich(ipt); + struct ICh src_peak = saturate(ich.h, src); + struct ICh dst_peak = saturate(ich.h, dst); + struct IPT mapped = rgb2ipt(ipt2rgb(ipt, src), dst); + + // Protect in gamut region + const float maxC = fmaxf(src_peak.C, dst_peak.C); + float k = pl_smoothstep(c->perceptual_deadzone, 1.0f, ich.C / maxC); + k *= c->perceptual_strength; + ipt.I = PL_MIX(ipt.I, mapped.I, k); + ipt.P = PL_MIX(ipt.P, mapped.P, k); + ipt.T = PL_MIX(ipt.T, mapped.T, k); + + struct RGB rgb = ipt2rgb(ipt, dst); + const float maxRGB = fmaxf(rgb.R, fmaxf(rgb.G, rgb.B)); + rgb.R = fmaxf(softclip(rgb.R, maxRGB, dst.max_rgb, c), dst.min_rgb); + rgb.G = fmaxf(softclip(rgb.G, maxRGB, dst.max_rgb, c), dst.min_rgb); + rgb.B = fmaxf(softclip(rgb.B, maxRGB, dst.max_rgb, c), dst.min_rgb); + ipt = rgb2ipt(rgb, dst); + } +} + +const struct pl_gamut_map_function pl_gamut_map_perceptual = { + .name = "perceptual", + .description = "Perceptual mapping", + .bidirectional = true, + .map = perceptual, +}; + +static void softclip_map(float *lut, const struct pl_gamut_map_params *params) +{ + const struct pl_gamut_map_constants *c = ¶ms->constants; + + // Separate cache after hueshift, because this invalidates previous cache + struct cache cache_pre, cache_post; + struct gamut dst_pre, src_pre, src_post, dst_post; + struct hueshift hueshift; + get_gamuts(&dst_pre, &src_pre, &cache_pre, params); + get_gamuts(&dst_post, &src_post, &cache_post, params); + hueshift_prepare(&hueshift, src_pre, dst_pre); + + FOREACH_LUT(lut, ipt) { + struct gamut src = src_pre; + struct gamut dst = dst_pre; + + if (ipt.I <= dst.min_luma) { + ipt.P = ipt.T = 0.0f; + continue; + } + + struct ICh ich = ipt2ich(ipt); + if (ich.C <= 1e-2f) + continue; // Fast path for achromatic colors + + float margin = 1.0f; + struct ICh shifted = hueshift_apply(&hueshift, ich); + if (fabsf(shifted.h - ich.h) >= 1e-3f) { + struct ICh src_border = desat_bounded(ich.I, ich.h, 0.0f, 0.5f, src); + struct ICh dst_border = desat_bounded(ich.I, ich.h, 0.0f, 0.5f, dst); + const float k = pl_smoothstep(dst_border.C * c->softclip_knee, + src_border.C, ich.C); + ich.h = PL_MIX(ich.h, shifted.h, k); + src = src_post; + dst = dst_post; + + // Expand/contract chromaticity margin to correspond to the altered + // size of the hue leaf after applying the hue delta + struct ICh 
shift_border = desat_bounded(ich.I, ich.h, 0.0f, 0.5f, src); + margin *= fmaxf(1.0f, src_border.C / shift_border.C); + } + + // Determine intersections with source and target gamuts, and + // apply softclip to the chromaticity + struct ICh source = saturate(ich.h, src); + struct ICh target = saturate(ich.h, dst); + struct ICh border = desat_bounded(ich.I, ich.h, 0.0f, target.C, dst); + const float chromaticity = PL_MIX(target.C, border.C, c->softclip_desat); + ich.C = softclip(ich.C, margin * source.C, chromaticity, c); + + // Soft-clip the resulting RGB color. This will generally distort + // hues slightly, but hopefully in an aesthetically pleasing way. + struct ICh saturated = { ich.I, chromaticity, ich.h }; + struct RGB peak = ipt2rgb(ich2ipt(saturated), dst); + struct RGB rgb = ipt2rgb(ich2ipt(ich), dst); + rgb.R = fmaxf(softclip(rgb.R, peak.R, dst.max_rgb, c), dst.min_rgb); + rgb.G = fmaxf(softclip(rgb.G, peak.G, dst.max_rgb, c), dst.min_rgb); + rgb.B = fmaxf(softclip(rgb.B, peak.B, dst.max_rgb, c), dst.min_rgb); + ipt = rgb2ipt(rgb, dst); + } +} + +const struct pl_gamut_map_function pl_gamut_map_softclip = { + .name = "softclip", + .description = "Soft clipping", + .map = softclip_map, +}; + +static void relative(float *lut, const struct pl_gamut_map_params *params) +{ + const struct pl_gamut_map_constants *c = ¶ms->constants; + struct cache cache; + struct gamut dst; + get_gamuts(&dst, NULL, &cache, params); + + FOREACH_LUT(lut, ipt) + ipt = clip_gamma(ipt, c->colorimetric_gamma, dst); +} + +const struct pl_gamut_map_function pl_gamut_map_relative = { + .name = "relative", + .description = "Colorimetric clip", + .map = relative, +}; + +static void desaturate(float *lut, const struct pl_gamut_map_params *params) +{ + struct cache cache; + struct gamut dst; + get_gamuts(&dst, NULL, &cache, params); + + FOREACH_LUT(lut, ipt) + ipt = clip_gamma(ipt, 0.0f, dst); +} + +const struct pl_gamut_map_function pl_gamut_map_desaturate = { + .name = "desaturate", + .description = "Desaturating clip", + .map = desaturate, +}; + +static void saturation(float *lut, const struct pl_gamut_map_params *params) +{ + struct cache cache; + struct gamut dst, src; + get_gamuts(&dst, &src, &cache, params); + + FOREACH_LUT(lut, ipt) + ipt = rgb2ipt(ipt2rgb(ipt, src), dst); +} + +const struct pl_gamut_map_function pl_gamut_map_saturation = { + .name = "saturation", + .description = "Saturation mapping", + .bidirectional = true, + .map = saturation, +}; + +static void absolute(float *lut, const struct pl_gamut_map_params *params) +{ + const struct pl_gamut_map_constants *c = ¶ms->constants; + struct cache cache; + struct gamut dst; + get_gamuts(&dst, NULL, &cache, params); + pl_matrix3x3 m = pl_get_adaptation_matrix(params->output_gamut.white, + params->input_gamut.white); + + FOREACH_LUT(lut, ipt) { + struct RGB rgb = ipt2rgb(ipt, dst); + pl_matrix3x3_apply(&m, (float *) &rgb); + ipt = rgb2ipt(rgb, dst); + ipt = clip_gamma(ipt, c->colorimetric_gamma, dst); + } +} + +const struct pl_gamut_map_function pl_gamut_map_absolute = { + .name = "absolute", + .description = "Absolute colorimetric clip", + .map = absolute, +}; + +static void highlight(float *lut, const struct pl_gamut_map_params *params) +{ + struct cache cache; + struct gamut dst; + get_gamuts(&dst, NULL, &cache, params); + + FOREACH_LUT(lut, ipt) { + if (!ingamut(ipt, dst)) { + ipt.I = fminf(ipt.I + 0.1f, 1.0f); + ipt.P = fclampf(-1.2f * ipt.P, -0.5f, 0.5f); + ipt.T = fclampf(-1.2f * ipt.T, -0.5f, 0.5f); + } + } +} + +const struct pl_gamut_map_function 
pl_gamut_map_highlight = { + .name = "highlight", + .description = "Highlight out-of-gamut pixels", + .map = highlight, +}; + +static void linear(float *lut, const struct pl_gamut_map_params *params) +{ + struct cache cache; + struct gamut dst, src; + get_gamuts(&dst, &src, &cache, params); + + float gain = 1.0f; + for (float hue = -M_PI; hue < M_PI; hue += 0.1f) + gain = fminf(gain, saturate(hue, dst).C / saturate(hue, src).C); + + FOREACH_LUT(lut, ipt) { + struct ICh ich = ipt2ich(ipt); + ich.C *= gain; + ipt = ich2ipt(ich); + } +} + +const struct pl_gamut_map_function pl_gamut_map_linear = { + .name = "linear", + .description = "Linear desaturate", + .map = linear, +}; + +static void darken(float *lut, const struct pl_gamut_map_params *params) +{ + const struct pl_gamut_map_constants *c = ¶ms->constants; + struct cache cache; + struct gamut dst, src; + get_gamuts(&dst, &src, &cache, params); + + static const struct RGB points[6] = { + {1, 0, 0}, {0, 1, 0}, {0, 0, 1}, + {0, 1, 1}, {1, 0, 1}, {1, 1, 0}, + }; + + float gain = 1.0f; + for (int i = 0; i < PL_ARRAY_SIZE(points); i++) { + const struct RGB p = ipt2rgb(rgb2ipt(points[i], src), dst); + const float maxRGB = PL_MAX3(p.R, p.G, p.B); + gain = fminf(gain, 1.0 / maxRGB); + } + + FOREACH_LUT(lut, ipt) { + struct RGB rgb = ipt2rgb(ipt, dst); + rgb.R *= gain; + rgb.G *= gain; + rgb.B *= gain; + ipt = rgb2ipt(rgb, dst); + ipt = clip_gamma(ipt, c->colorimetric_gamma, dst); + } +} + +const struct pl_gamut_map_function pl_gamut_map_darken = { + .name = "darken", + .description = "Darken and clip", + .map = darken, +}; + +static void noop(float *lut, const struct pl_gamut_map_params *params) +{ + return; +} + +const struct pl_gamut_map_function pl_gamut_map_clip = { + .name = "clip", + .description = "No gamut mapping (hard clip)", + .map = noop, +}; + +const struct pl_gamut_map_function * const pl_gamut_map_functions[] = { + &pl_gamut_map_clip, + &pl_gamut_map_perceptual, + &pl_gamut_map_softclip, + &pl_gamut_map_relative, + &pl_gamut_map_saturation, + &pl_gamut_map_absolute, + &pl_gamut_map_desaturate, + &pl_gamut_map_darken, + &pl_gamut_map_highlight, + &pl_gamut_map_linear, + NULL +}; + +const int pl_num_gamut_map_functions = PL_ARRAY_SIZE(pl_gamut_map_functions) - 1; + +const struct pl_gamut_map_function *pl_find_gamut_map_function(const char *name) +{ + for (int i = 0; i < pl_num_gamut_map_functions; i++) { + if (strcmp(name, pl_gamut_map_functions[i]->name) == 0) + return pl_gamut_map_functions[i]; + } + + return NULL; +} diff --git a/src/glsl/glslang.cc b/src/glsl/glslang.cc new file mode 100644 index 0000000..2bc923c --- /dev/null +++ b/src/glsl/glslang.cc @@ -0,0 +1,121 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "config_internal.h" + +#include <assert.h> + +extern "C" { +#include "pl_alloc.h" +#include "pl_thread.h" +} + +#include <glslang/Public/ShaderLang.h> +#include <glslang/SPIRV/GlslangToSpv.h> +#include <glslang/build_info.h> + +#include "glslang.h" + +#if (GLSLANG_VERSION_MAJOR * 1000 + GLSLANG_VERSION_MINOR) >= 11013 +#include <glslang/Public/ResourceLimits.h> +#define DefaultTBuiltInResource *GetDefaultResources() +#endif + +using namespace glslang; + +static pl_static_mutex pl_glslang_mutex = PL_STATIC_MUTEX_INITIALIZER; +static int pl_glslang_refcount; + +bool pl_glslang_init(void) +{ + bool ret = true; + + pl_static_mutex_lock(&pl_glslang_mutex); + if (pl_glslang_refcount++ == 0) + ret = InitializeProcess(); + pl_static_mutex_unlock(&pl_glslang_mutex); + + return ret; +} + +void pl_glslang_uninit(void) +{ + pl_static_mutex_lock(&pl_glslang_mutex); + if (--pl_glslang_refcount == 0) + FinalizeProcess(); + pl_static_mutex_unlock(&pl_glslang_mutex); +} + +struct pl_glslang_res *pl_glslang_compile(struct pl_glsl_version glsl_ver, + struct pl_spirv_version spirv_ver, + enum glsl_shader_stage stage, + const char *text) +{ + assert(pl_glslang_refcount); + struct pl_glslang_res *res = pl_zalloc_ptr(NULL, res); + + EShLanguage lang; + switch (stage) { + case GLSL_SHADER_VERTEX: lang = EShLangVertex; break; + case GLSL_SHADER_FRAGMENT: lang = EShLangFragment; break; + case GLSL_SHADER_COMPUTE: lang = EShLangCompute; break; + default: abort(); + } + + TShader *shader = new TShader(lang); + + shader->setEnvClient(EShClientVulkan, (EShTargetClientVersion) spirv_ver.env_version); + shader->setEnvTarget(EShTargetSpv, (EShTargetLanguageVersion) spirv_ver.spv_version); + shader->setStrings(&text, 1); + + TBuiltInResource limits = DefaultTBuiltInResource; + limits.maxComputeWorkGroupSizeX = glsl_ver.max_group_size[0]; + limits.maxComputeWorkGroupSizeY = glsl_ver.max_group_size[1]; + limits.maxComputeWorkGroupSizeZ = glsl_ver.max_group_size[2]; + limits.minProgramTexelOffset = glsl_ver.min_gather_offset; + limits.maxProgramTexelOffset = glsl_ver.max_gather_offset; + + if (!shader->parse(&limits, 0, true, EShMsgDefault)) { + res->error_msg = pl_str0dup0(res, shader->getInfoLog()); + delete shader; + return res; + } + + TProgram *prog = new TProgram(); + prog->addShader(shader); + if (!prog->link(EShMsgDefault)) { + res->error_msg = pl_str0dup0(res, prog->getInfoLog()); + delete shader; + delete prog; + return res; + } + + SpvOptions options; + options.disableOptimizer = false; + options.stripDebugInfo = true; + options.optimizeSize = true; + options.validate = true; + std::vector<unsigned int> spirv; + GlslangToSpv(*prog->getIntermediate(lang), spirv, &options); + + res->success = true; + res->size = spirv.size() * sizeof(unsigned int); + res->data = pl_memdup(res, spirv.data(), res->size), + delete shader; + delete prog; + return res; +} diff --git a/src/glsl/glslang.h b/src/glsl/glslang.h new file mode 100644 index 0000000..a5965a5 --- /dev/null +++ b/src/glsl/glslang.h @@ -0,0 +1,57 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include <stdlib.h> +#include <stdbool.h> + +typedef struct TLimits TLimits; +typedef struct TBuiltInResource TBuiltInResource; +#include <glslang/Include/ResourceLimits.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#include "utils.h" + +bool pl_glslang_init(void); +void pl_glslang_uninit(void); + +struct pl_glslang_res { + // Compilation status + bool success; + const char *error_msg; + + // Compiled shader memory, or NULL + void *data; + size_t size; +}; + +// Compile GLSL into a SPIRV stream, if possible. The resulting +// pl_glslang_res can simply be freed with pl_free() when done. +struct pl_glslang_res *pl_glslang_compile(struct pl_glsl_version glsl_ver, + struct pl_spirv_version spirv_ver, + enum glsl_shader_stage stage, + const char *shader); + +extern const TBuiltInResource DefaultTBuiltInResource; + +#ifdef __cplusplus +} +#endif diff --git a/src/glsl/glslang_resources.c b/src/glsl/glslang_resources.c new file mode 100644 index 0000000..a111c15 --- /dev/null +++ b/src/glsl/glslang_resources.c @@ -0,0 +1,132 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "glslang.h" + +// Taken from glslang's examples, which apparently generally bases the choices +// on OpenGL specification limits +// +// Note: This lives in a separate file so we can compile this struct using C99 +// designated initializers instead of using C++ struct initializers, because +// the latter will break on every upstream struct extension. 
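[Editor's note] To illustrate the point made in the comment above, here is a minimal, self-contained sketch of why C99 designated initializers tolerate upstream struct extensions while positional initialization does not. The struct and field names below are made up for illustration only; they are not the real TBuiltInResource.

/* Hypothetical struct that later gains a member inserted in the middle. */
struct limits {
    int max_lights;
    int max_clip_planes;     /* imagine this member was added upstream */
    int max_texture_units;
};

/* Positional initialization maps values by order: after the insertion,
 * 80 silently lands on max_clip_planes instead of max_texture_units,
 * and max_texture_units becomes 0. */
static const struct limits positional = { 32, 80 };

/* Designated initializers bind each value to a field by name, so adding
 * or reordering members upstream cannot shift the values around;
 * unnamed members are simply zero-initialized. */
static const struct limits designated = {
    .max_lights        = 32,
    .max_texture_units = 80,
};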
+const TBuiltInResource DefaultTBuiltInResource = { + .maxLights = 32, + .maxClipPlanes = 6, + .maxTextureUnits = 32, + .maxTextureCoords = 32, + .maxVertexAttribs = 64, + .maxVertexUniformComponents = 4096, + .maxVaryingFloats = 64, + .maxVertexTextureImageUnits = 32, + .maxCombinedTextureImageUnits = 80, + .maxTextureImageUnits = 32, + .maxFragmentUniformComponents = 4096, + .maxDrawBuffers = 32, + .maxVertexUniformVectors = 128, + .maxVaryingVectors = 8, + .maxFragmentUniformVectors = 16, + .maxVertexOutputVectors = 16, + .maxFragmentInputVectors = 15, + .minProgramTexelOffset = -8, + .maxProgramTexelOffset = 7, + .maxClipDistances = 8, + .maxComputeWorkGroupCountX = 65535, + .maxComputeWorkGroupCountY = 65535, + .maxComputeWorkGroupCountZ = 65535, + .maxComputeWorkGroupSizeX = 1024, + .maxComputeWorkGroupSizeY = 1024, + .maxComputeWorkGroupSizeZ = 64, + .maxComputeUniformComponents = 1024, + .maxComputeTextureImageUnits = 16, + .maxComputeImageUniforms = 8, + .maxComputeAtomicCounters = 8, + .maxComputeAtomicCounterBuffers = 1, + .maxVaryingComponents = 60, + .maxVertexOutputComponents = 64, + .maxGeometryInputComponents = 64, + .maxGeometryOutputComponents = 128, + .maxFragmentInputComponents = 128, + .maxImageUnits = 8, + .maxCombinedImageUnitsAndFragmentOutputs = 8, + .maxCombinedShaderOutputResources = 8, + .maxImageSamples = 0, + .maxVertexImageUniforms = 0, + .maxTessControlImageUniforms = 0, + .maxTessEvaluationImageUniforms = 0, + .maxGeometryImageUniforms = 0, + .maxFragmentImageUniforms = 8, + .maxCombinedImageUniforms = 8, + .maxGeometryTextureImageUnits = 16, + .maxGeometryOutputVertices = 256, + .maxGeometryTotalOutputComponents = 1024, + .maxGeometryUniformComponents = 1024, + .maxGeometryVaryingComponents = 64, + .maxTessControlInputComponents = 128, + .maxTessControlOutputComponents = 128, + .maxTessControlTextureImageUnits = 16, + .maxTessControlUniformComponents = 1024, + .maxTessControlTotalOutputComponents = 4096, + .maxTessEvaluationInputComponents = 128, + .maxTessEvaluationOutputComponents = 128, + .maxTessEvaluationTextureImageUnits = 16, + .maxTessEvaluationUniformComponents = 1024, + .maxTessPatchComponents = 120, + .maxPatchVertices = 32, + .maxTessGenLevel = 64, + .maxViewports = 16, + .maxVertexAtomicCounters = 0, + .maxTessControlAtomicCounters = 0, + .maxTessEvaluationAtomicCounters = 0, + .maxGeometryAtomicCounters = 0, + .maxFragmentAtomicCounters = 8, + .maxCombinedAtomicCounters = 8, + .maxAtomicCounterBindings = 1, + .maxVertexAtomicCounterBuffers = 0, + .maxTessControlAtomicCounterBuffers = 0, + .maxTessEvaluationAtomicCounterBuffers = 0, + .maxGeometryAtomicCounterBuffers = 0, + .maxFragmentAtomicCounterBuffers = 1, + .maxCombinedAtomicCounterBuffers = 1, + .maxAtomicCounterBufferSize = 16384, + .maxTransformFeedbackBuffers = 4, + .maxTransformFeedbackInterleavedComponents = 64, + .maxCullDistances = 8, + .maxCombinedClipAndCullDistances = 8, + .maxSamples = 4, + .maxMeshOutputVerticesNV = 256, + .maxMeshOutputPrimitivesNV = 512, + .maxMeshWorkGroupSizeX_NV = 32, + .maxMeshWorkGroupSizeY_NV = 1, + .maxMeshWorkGroupSizeZ_NV = 1, + .maxTaskWorkGroupSizeX_NV = 32, + .maxTaskWorkGroupSizeY_NV = 1, + .maxTaskWorkGroupSizeZ_NV = 1, + .maxMeshViewCountNV = 4, + .maxDualSourceDrawBuffersEXT = 1, + + .limits = { + .nonInductiveForLoops = 1, + .whileLoops = 1, + .doWhileLoops = 1, + .generalUniformIndexing = 1, + .generalAttributeMatrixVectorIndexing = 1, + .generalVaryingIndexing = 1, + .generalSamplerIndexing = 1, + .generalVariableIndexing = 1, + 
.generalConstantMatrixVectorIndexing = 1, + }, +}; diff --git a/src/glsl/meson.build b/src/glsl/meson.build new file mode 100644 index 0000000..5cebfb8 --- /dev/null +++ b/src/glsl/meson.build @@ -0,0 +1,73 @@ +# shaderc +shaderc = dependency('shaderc', version: '>=2019.1', required: get_option('shaderc')) +components.set('shaderc', shaderc.found()) +if shaderc.found() + build_deps += shaderc + sources += 'glsl/spirv_shaderc.c' +endif + +# glslang +glslang = disabler() +glslang_req = get_option('glslang') +if glslang_req.auto() and shaderc.found() + + # we only need one or the other, and shaderc is preferred + message('Skipping `glslang` because `shaderc` is available') + +elif not glslang_req.disabled() + + glslang_deps = [ + cxx.find_library('glslang-default-resource-limits', required: false) + ] + + # meson doesn't respect generator expressions in INTERFACE_LINK_LIBRARIES + # https://github.com/mesonbuild/meson/issues/8232 + # TODO: Use the following once it's fixed + # glslang = dependency('glslang', method: 'cmake', modules: ['glslang::SPIRV']) + + prefer_static = get_option('prefer_static') + found_lib = false + foreach arg : [[prefer_static, false], [not prefer_static, glslang_req]] + static = arg[0] + required = arg[1] + + spirv = cxx.find_library('SPIRV', required: required, static: static) + + if not spirv.found() + continue + endif + + glslang_deps += spirv + + if static + glslang_deps += [ + # Always required for static linking + cxx.find_library('MachineIndependent', required: true, static: true), + cxx.find_library('OSDependent', required: true, static: true), + cxx.find_library('OGLCompiler', required: true, static: true), + cxx.find_library('GenericCodeGen', required: true, static: true), + # SPIRV-Tools are required only if optimizer is enabled in glslang build + cxx.find_library('SPIRV-Tools', required: false, static: true), + cxx.find_library('SPIRV-Tools-opt', required: false, static: true), + ] + endif + + found_lib = true + break + endforeach + + if found_lib and cc.has_header('glslang/build_info.h') + glslang = declare_dependency(dependencies: glslang_deps) + endif + +endif + +components.set('glslang', glslang.found()) +if glslang.found() + build_deps += glslang + sources += [ + 'glsl/glslang.cc', + 'glsl/glslang_resources.c', + 'glsl/spirv_glslang.c', + ] +endif diff --git a/src/glsl/spirv.c b/src/glsl/spirv.c new file mode 100644 index 0000000..8317ed7 --- /dev/null +++ b/src/glsl/spirv.c @@ -0,0 +1,64 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "spirv.h" + +extern const struct spirv_compiler pl_spirv_shaderc; +extern const struct spirv_compiler pl_spirv_glslang; + +static const struct spirv_compiler *compilers[] = { +#ifdef PL_HAVE_SHADERC + &pl_spirv_shaderc, +#endif +#ifdef PL_HAVE_GLSLANG + &pl_spirv_glslang, +#endif +}; + +pl_spirv pl_spirv_create(pl_log log, struct pl_spirv_version spirv_ver) +{ + for (int i = 0; i < PL_ARRAY_SIZE(compilers); i++) { + pl_spirv spirv = compilers[i]->create(log, spirv_ver); + if (!spirv) + continue; + + pl_info(log, "Initialized SPIR-V compiler '%s'", compilers[i]->name); + return spirv; + } + + pl_fatal(log, "Failed initializing any SPIR-V compiler! Maybe libplacebo " + "was built without support for either libshaderc or glslang?"); + return NULL; +} + +void pl_spirv_destroy(pl_spirv *pspirv) +{ + pl_spirv spirv = *pspirv; + if (!spirv) + return; + + spirv->impl->destroy(spirv); + *pspirv = NULL; +} + +pl_str pl_spirv_compile_glsl(pl_spirv spirv, void *alloc, + struct pl_glsl_version glsl, + enum glsl_shader_stage stage, + const char *shader) +{ + return spirv->impl->compile(spirv, alloc, glsl, stage, shader); +} diff --git a/src/glsl/spirv.h b/src/glsl/spirv.h new file mode 100644 index 0000000..fa4494a --- /dev/null +++ b/src/glsl/spirv.h @@ -0,0 +1,50 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include "log.h" +#include "utils.h" + +typedef const struct pl_spirv_t { + const struct spirv_compiler *impl; + pl_log log; + + // SPIR-V version specified at creation time. + struct pl_spirv_version version; + + // For cache invalidation, should uniquely identify everything about this + // spirv compiler and its configuration. + uint64_t signature; +} *pl_spirv; + +// Initialize a SPIR-V compiler instance, or returns NULL on failure. +pl_spirv pl_spirv_create(pl_log log, struct pl_spirv_version spirv_ver); +void pl_spirv_destroy(pl_spirv *spirv); + +// Compile GLSL to SPIR-V. Returns {0} on failure. +pl_str pl_spirv_compile_glsl(pl_spirv spirv, void *alloc, + struct pl_glsl_version glsl_ver, + enum glsl_shader_stage stage, + const char *shader); + +struct spirv_compiler { + const char *name; + void (*destroy)(pl_spirv spirv); + __typeof__(pl_spirv_create) *create; + __typeof__(pl_spirv_compile_glsl) *compile; +}; diff --git a/src/glsl/spirv_glslang.c b/src/glsl/spirv_glslang.c new file mode 100644 index 0000000..ffb8f55 --- /dev/null +++ b/src/glsl/spirv_glslang.c @@ -0,0 +1,112 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "hash.h" +#include "spirv.h" +#include "utils.h" +#include "glsl/glslang.h" + +// This header contains only preprocessor definitions +#include <glslang/build_info.h> + +// This is awkward, but we cannot use upstream macro, it was fixed in 11.11.0 +#define PL_GLSLANG_VERSION_GREATER_THAN(major, minor, patch) \ + ((GLSLANG_VERSION_MAJOR) > (major) || ((major) == GLSLANG_VERSION_MAJOR && \ + ((GLSLANG_VERSION_MINOR) > (minor) || ((minor) == GLSLANG_VERSION_MINOR && \ + (GLSLANG_VERSION_PATCH) > (patch))))) + +#if PL_GLSLANG_VERSION_GREATER_THAN(11, 8, 0) +#define GLSLANG_SPV_MAX PL_SPV_VERSION(1, 6) +#elif PL_GLSLANG_VERSION_GREATER_THAN(7, 13, 3496) +#define GLSLANG_SPV_MAX PL_SPV_VERSION(1, 5) +#elif PL_GLSLANG_VERSION_GREATER_THAN(6, 2, 2596) +#define GLSLANG_SPV_MAX PL_SPV_VERSION(1, 3) +#else +#define GLSLANG_SPV_MAX PL_SPV_VERSION(1, 0) +#endif + +const struct spirv_compiler pl_spirv_glslang; + +static void glslang_destroy(pl_spirv spirv) +{ + pl_glslang_uninit(); + pl_free((void *) spirv); +} + +static pl_spirv glslang_create(pl_log log, struct pl_spirv_version spirv_ver) +{ + if (!pl_glslang_init()) { + pl_fatal(log, "Failed initializing glslang SPIR-V compiler!"); + return NULL; + } + + struct pl_spirv_t *spirv = pl_alloc_ptr(NULL, spirv); + *spirv = (struct pl_spirv_t) { + .signature = pl_str0_hash(pl_spirv_glslang.name), + .impl = &pl_spirv_glslang, + .version = spirv_ver, + .log = log, + }; + + PL_INFO(spirv, "glslang version: %d.%d.%d", + GLSLANG_VERSION_MAJOR, + GLSLANG_VERSION_MINOR, + GLSLANG_VERSION_PATCH); + + // Clamp to supported version by glslang + if (GLSLANG_SPV_MAX < spirv->version.spv_version) { + spirv->version.spv_version = GLSLANG_SPV_MAX; + spirv->version.env_version = pl_spirv_version_to_vulkan(GLSLANG_SPV_MAX); + } + + pl_hash_merge(&spirv->signature, (uint64_t) spirv->version.spv_version << 32 | + spirv->version.env_version); + pl_hash_merge(&spirv->signature, (GLSLANG_VERSION_MAJOR & 0xFF) << 24 | + (GLSLANG_VERSION_MINOR & 0xFF) << 16 | + (GLSLANG_VERSION_PATCH & 0xFFFF)); + return spirv; +} + +static pl_str glslang_compile(pl_spirv spirv, void *alloc, + struct pl_glsl_version glsl_ver, + enum glsl_shader_stage stage, + const char *shader) +{ + struct pl_glslang_res *res; + + res = pl_glslang_compile(glsl_ver, spirv->version, stage, shader); + if (!res || !res->success) { + PL_ERR(spirv, "glslang failed: %s", res ? res->error_msg : "(null)"); + pl_free(res); + return (struct pl_str) {0}; + } + + struct pl_str ret = { + .buf = pl_steal(alloc, res->data), + .len = res->size, + }; + + pl_free(res); + return ret; +} + +const struct spirv_compiler pl_spirv_glslang = { + .name = "glslang", + .destroy = glslang_destroy, + .create = glslang_create, + .compile = glslang_compile, +}; diff --git a/src/glsl/spirv_shaderc.c b/src/glsl/spirv_shaderc.c new file mode 100644 index 0000000..e384382 --- /dev/null +++ b/src/glsl/spirv_shaderc.c @@ -0,0 +1,174 @@ +/* + * This file is part of libplacebo. 
+ * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <stdlib.h> +#include <shaderc/shaderc.h> + +#include "hash.h" +#include "spirv.h" +#include "utils.h" + +const struct spirv_compiler pl_spirv_shaderc; + +struct priv { + shaderc_compiler_t compiler; +}; + +static void shaderc_destroy(pl_spirv spirv) +{ + struct priv *p = PL_PRIV(spirv); + shaderc_compiler_release(p->compiler); + pl_free((void *) spirv); +} + +static pl_spirv shaderc_create(pl_log log, struct pl_spirv_version spirv_ver) +{ + struct pl_spirv_t *spirv = pl_alloc_obj(NULL, spirv, struct priv); + *spirv = (struct pl_spirv_t) { + .signature = pl_str0_hash(pl_spirv_shaderc.name), + .impl = &pl_spirv_shaderc, + .version = spirv_ver, + .log = log, + }; + + struct priv *p = PL_PRIV(spirv); + p->compiler = shaderc_compiler_initialize(); + if (!p->compiler) + goto error; + + unsigned int ver = 0, rev = 0; + shaderc_get_spv_version(&ver, &rev); + PL_INFO(spirv, "shaderc SPIR-V version %u.%u rev %u", + ver >> 16, (ver >> 8) & 0xff, rev); + + // Clamp to supported version by shaderc + if (ver < spirv->version.spv_version) { + spirv->version.spv_version = ver; + spirv->version.env_version = pl_spirv_version_to_vulkan(ver); + } + + pl_hash_merge(&spirv->signature, (uint64_t) spirv->version.spv_version << 32 | + spirv->version.env_version); + pl_hash_merge(&spirv->signature, (uint64_t) ver << 32 | rev); + return spirv; + +error: + shaderc_destroy(spirv); + return NULL; +} + +static pl_str shaderc_compile(pl_spirv spirv, void *alloc, + struct pl_glsl_version glsl_ver, + enum glsl_shader_stage stage, + const char *shader) +{ + struct priv *p = PL_PRIV(spirv); + const size_t len = strlen(shader); + + shaderc_compile_options_t opts = shaderc_compile_options_initialize(); + if (!opts) + return (pl_str) {0}; + + shaderc_compile_options_set_optimization_level(opts, + shaderc_optimization_level_performance); + shaderc_compile_options_set_target_spirv(opts, spirv->version.spv_version); + shaderc_compile_options_set_target_env(opts, shaderc_target_env_vulkan, + spirv->version.env_version); + + for (int i = 0; i < 3; i++) { + shaderc_compile_options_set_limit(opts, + shaderc_limit_max_compute_work_group_size_x + i, + glsl_ver.max_group_size[i]); + } + + shaderc_compile_options_set_limit(opts, + shaderc_limit_min_program_texel_offset, + glsl_ver.min_gather_offset); + shaderc_compile_options_set_limit(opts, + shaderc_limit_max_program_texel_offset, + glsl_ver.max_gather_offset); + + static const shaderc_shader_kind kinds[] = { + [GLSL_SHADER_VERTEX] = shaderc_glsl_vertex_shader, + [GLSL_SHADER_FRAGMENT] = shaderc_glsl_fragment_shader, + [GLSL_SHADER_COMPUTE] = shaderc_glsl_compute_shader, + }; + + static const char * const file_name = "input"; + static const char * const entry_point = "main"; + + shaderc_compilation_result_t res; + res = shaderc_compile_into_spv(p->compiler, shader, len, kinds[stage], + file_name, 
entry_point, opts); + + int errs = shaderc_result_get_num_errors(res), + warn = shaderc_result_get_num_warnings(res); + + enum pl_log_level lev = errs ? PL_LOG_ERR : warn ? PL_LOG_INFO : PL_LOG_DEBUG; + + int s = shaderc_result_get_compilation_status(res); + bool success = s == shaderc_compilation_status_success; + if (!success) + lev = PL_LOG_ERR; + + const char *msg = shaderc_result_get_error_message(res); + if (msg[0]) + PL_MSG(spirv, lev, "shaderc output:\n%s", msg); + + static const char *results[] = { + [shaderc_compilation_status_success] = "success", + [shaderc_compilation_status_invalid_stage] = "invalid stage", + [shaderc_compilation_status_compilation_error] = "error", + [shaderc_compilation_status_internal_error] = "internal error", + [shaderc_compilation_status_null_result_object] = "no result", + [shaderc_compilation_status_invalid_assembly] = "invalid assembly", + }; + + const char *status = s < PL_ARRAY_SIZE(results) ? results[s] : "unknown"; + PL_MSG(spirv, lev, "shaderc compile status '%s' (%d errors, %d warnings)", + status, errs, warn); + + pl_str ret = {0}; + if (success) { + void *bytes = (void *) shaderc_result_get_bytes(res); + pl_assert(bytes); + ret.len = shaderc_result_get_length(res); + ret.buf = pl_memdup(alloc, bytes, ret.len); + + if (pl_msg_test(spirv->log, PL_LOG_TRACE)) { + shaderc_compilation_result_t dis; + dis = shaderc_compile_into_spv_assembly(p->compiler, shader, len, + kinds[stage], file_name, + entry_point, opts); + PL_TRACE(spirv, "Generated SPIR-V:\n%.*s", + (int) shaderc_result_get_length(dis), + shaderc_result_get_bytes(dis)); + shaderc_result_release(dis); + } + } + + shaderc_result_release(res); + shaderc_compile_options_release(opts); + return ret; +} + +const struct spirv_compiler pl_spirv_shaderc = { + .name = "shaderc", + .destroy = shaderc_destroy, + .create = shaderc_create, + .compile = shaderc_compile, +}; diff --git a/src/glsl/utils.h b/src/glsl/utils.h new file mode 100644 index 0000000..965ea9e --- /dev/null +++ b/src/glsl/utils.h @@ -0,0 +1,52 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#pragma once + +#include <stdbool.h> +#include <stdint.h> + +#include <libplacebo/gpu.h> + +#define PL_SPV_VERSION(major, minor) ((major) << 16 | (minor) << 8) +#define PL_VLK_VERSION(major, minor) ((major) << 22 | (minor) << 12) + +// Max version that can be used +#define PL_MAX_SPIRV_VER PL_SPV_VERSION(1, 6) + +struct pl_spirv_version { + uint32_t env_version; + uint32_t spv_version; +}; + +// Returns minimum Vulkan version for given SPIR-V version +static inline uint32_t pl_spirv_version_to_vulkan(uint32_t spirv_ver) +{ + if (spirv_ver >= PL_SPV_VERSION(1, 6)) + return PL_VLK_VERSION(1, 3); + if (spirv_ver >= PL_SPV_VERSION(1, 4)) + return PL_VLK_VERSION(1, 2); + if (spirv_ver >= PL_SPV_VERSION(1, 1)) + return PL_VLK_VERSION(1, 1); + return PL_VLK_VERSION(1, 0); +} + +enum glsl_shader_stage { + GLSL_SHADER_VERTEX = 0, + GLSL_SHADER_FRAGMENT, + GLSL_SHADER_COMPUTE, +}; diff --git a/src/gpu.c b/src/gpu.c new file mode 100644 index 0000000..b639ec2 --- /dev/null +++ b/src/gpu.c @@ -0,0 +1,1338 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "common.h" +#include "gpu.h" + +#define require(expr) pl_require(gpu, expr) + +void pl_gpu_destroy(pl_gpu gpu) +{ + if (!gpu) + return; + + struct pl_gpu_fns *impl = PL_PRIV(gpu); + pl_dispatch_destroy(&impl->dp); + impl->destroy(gpu); +} + +pl_dispatch pl_gpu_dispatch(pl_gpu gpu) +{ + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + return impl->dp; +} + +pl_cache pl_gpu_cache(pl_gpu gpu) +{ + if (!gpu) + return NULL; + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + return atomic_load(&impl->cache); +} + +void pl_gpu_set_cache(pl_gpu gpu, pl_cache cache) +{ + struct pl_gpu_fns *impl = PL_PRIV(gpu); + atomic_store(&impl->cache, cache); +} + +bool pl_fmt_is_ordered(pl_fmt fmt) +{ + bool ret = !fmt->opaque; + for (int i = 0; i < fmt->num_components; i++) + ret &= fmt->sample_order[i] == i; + return ret; +} + +bool pl_fmt_is_float(pl_fmt fmt) +{ + switch (fmt->type) { + case PL_FMT_UNKNOWN: // more likely than not + case PL_FMT_FLOAT: + case PL_FMT_UNORM: + case PL_FMT_SNORM: + return true; + + case PL_FMT_UINT: + case PL_FMT_SINT: + return false; + + case PL_FMT_TYPE_COUNT: + break; + } + + pl_unreachable(); +} + +bool pl_fmt_has_modifier(pl_fmt fmt, uint64_t modifier) +{ + if (!fmt) + return false; + + for (int i = 0; i < fmt->num_modifiers; i++) { + if (fmt->modifiers[i] == modifier) + return true; + } + + return false; +} + +pl_fmt pl_find_fmt(pl_gpu gpu, enum pl_fmt_type type, int num_components, + int min_depth, int host_bits, enum pl_fmt_caps caps) +{ + for (int n = 0; n < gpu->num_formats; n++) { + pl_fmt fmt = gpu->formats[n]; + if (fmt->type != type || fmt->num_components != num_components) + continue; + if ((fmt->caps & caps) != caps) + continue; + + // When specifying some particular host representation, ensure the + // format is non-opaque, ordered and 
unpadded + if (host_bits && fmt->opaque) + continue; + if (host_bits && fmt->texel_size * 8 != host_bits * num_components) + continue; + if (host_bits && !pl_fmt_is_ordered(fmt)) + continue; + + for (int i = 0; i < fmt->num_components; i++) { + if (fmt->component_depth[i] < min_depth) + goto next_fmt; + if (host_bits && fmt->host_bits[i] != host_bits) + goto next_fmt; + } + + return fmt; + +next_fmt: ; // equivalent to `continue` + } + + // ran out of formats + PL_TRACE(gpu, "No matching format found"); + return NULL; +} + +pl_fmt pl_find_vertex_fmt(pl_gpu gpu, enum pl_fmt_type type, int comps) +{ + static const size_t sizes[] = { + [PL_FMT_FLOAT] = sizeof(float), + [PL_FMT_UNORM] = sizeof(unsigned), + [PL_FMT_UINT] = sizeof(unsigned), + [PL_FMT_SNORM] = sizeof(int), + [PL_FMT_SINT] = sizeof(int), + }; + + return pl_find_fmt(gpu, type, comps, 0, 8 * sizes[type], PL_FMT_CAP_VERTEX); +} + +pl_fmt pl_find_named_fmt(pl_gpu gpu, const char *name) +{ + if (!name) + return NULL; + + for (int i = 0; i < gpu->num_formats; i++) { + pl_fmt fmt = gpu->formats[i]; + if (strcmp(name, fmt->name) == 0) + return fmt; + } + + // ran out of formats + return NULL; +} + +pl_fmt pl_find_fourcc(pl_gpu gpu, uint32_t fourcc) +{ + if (!fourcc) + return NULL; + + for (int i = 0; i < gpu->num_formats; i++) { + pl_fmt fmt = gpu->formats[i]; + if (fourcc == fmt->fourcc) + return fmt; + } + + // ran out of formats + return NULL; +} + +static inline bool check_mod(pl_gpu gpu, pl_fmt fmt, uint64_t mod) +{ + for (int i = 0; i < fmt->num_modifiers; i++) { + if (fmt->modifiers[i] == mod) + return true; + } + + + PL_ERR(gpu, "DRM modifier %s not available for format %s. Available modifiers:", + PRINT_DRM_MOD(mod), fmt->name); + for (int i = 0; i < fmt->num_modifiers; i++) + PL_ERR(gpu, " %s", PRINT_DRM_MOD(fmt->modifiers[i])); + + return false; +} + +pl_tex pl_tex_create(pl_gpu gpu, const struct pl_tex_params *params) +{ + require(params->format); + require(!params->import_handle || !params->export_handle); + require(!params->import_handle || !params->initial_data); + if (params->export_handle) { + require(params->export_handle & gpu->export_caps.tex); + require(PL_ISPOT(params->export_handle)); + } + if (params->import_handle) { + require(params->import_handle & gpu->import_caps.tex); + require(PL_ISPOT(params->import_handle)); + if (params->import_handle == PL_HANDLE_DMA_BUF) { + if (!check_mod(gpu, params->format, params->shared_mem.drm_format_mod)) + goto error; + if (params->shared_mem.stride_w) + require(params->w && params->shared_mem.stride_w >= params->w); + if (params->shared_mem.stride_h) + require(params->h && params->shared_mem.stride_h >= params->h); + } else if (params->import_handle == PL_HANDLE_MTL_TEX) { + require(params->shared_mem.plane <= 2); + } + } + + switch (pl_tex_params_dimension(*params)) { + case 1: + require(params->w > 0); + require(params->w <= gpu->limits.max_tex_1d_dim); + require(!params->renderable); + require(!params->blit_src || gpu->limits.blittable_1d_3d); + require(!params->blit_dst || gpu->limits.blittable_1d_3d); + require(!params->format->num_planes); + break; + case 2: + require(params->w > 0 && params->h > 0); + require(params->w <= gpu->limits.max_tex_2d_dim); + require(params->h <= gpu->limits.max_tex_2d_dim); + break; + case 3: + require(params->w > 0 && params->h > 0 && params->d > 0); + require(params->w <= gpu->limits.max_tex_3d_dim); + require(params->h <= gpu->limits.max_tex_3d_dim); + require(params->d <= gpu->limits.max_tex_3d_dim); + require(!params->renderable); + 
require(!params->blit_src || gpu->limits.blittable_1d_3d); + require(!params->blit_dst || gpu->limits.blittable_1d_3d); + require(!params->format->num_planes); + break; + } + + enum pl_fmt_caps fmt_caps = params->format->caps; + bool fmt_opaque = params->format->opaque; + for (int i = 0; i < params->format->num_planes; i++) { + pl_fmt pfmt = params->format->planes[i].format; + fmt_caps |= pfmt->caps; + fmt_opaque &= pfmt->opaque; + } + + require(!params->host_readable || fmt_caps & PL_FMT_CAP_HOST_READABLE); + require(!params->host_writable || !fmt_opaque); + require(!params->sampleable || fmt_caps & PL_FMT_CAP_SAMPLEABLE); + require(!params->renderable || fmt_caps & PL_FMT_CAP_RENDERABLE); + require(!params->storable || fmt_caps & PL_FMT_CAP_STORABLE); + require(!params->blit_src || fmt_caps & PL_FMT_CAP_BLITTABLE); + require(!params->blit_dst || fmt_caps & PL_FMT_CAP_BLITTABLE); + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + return impl->tex_create(gpu, params); + +error: + if (params->debug_tag) + PL_ERR(gpu, " for texture: %s", params->debug_tag); + return NULL; +} + +void pl_tex_destroy(pl_gpu gpu, pl_tex *tex) +{ + if (!*tex) + return; + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + impl->tex_destroy(gpu, *tex); + *tex = NULL; +} + +static bool pl_tex_params_superset(struct pl_tex_params a, struct pl_tex_params b) +{ + return a.w == b.w && a.h == b.h && a.d == b.d && + a.format == b.format && + (a.sampleable || !b.sampleable) && + (a.renderable || !b.renderable) && + (a.storable || !b.storable) && + (a.blit_src || !b.blit_src) && + (a.blit_dst || !b.blit_dst) && + (a.host_writable || !b.host_writable) && + (a.host_readable || !b.host_readable); +} + +bool pl_tex_recreate(pl_gpu gpu, pl_tex *tex, const struct pl_tex_params *params) +{ + if (params->initial_data) { + PL_ERR(gpu, "pl_tex_recreate may not be used with `initial_data`!"); + return false; + } + + if (params->import_handle) { + PL_ERR(gpu, "pl_tex_recreate may not be used with `import_handle`!"); + return false; + } + + if (*tex && pl_tex_params_superset((*tex)->params, *params)) { + pl_tex_invalidate(gpu, *tex); + return true; + } + + PL_DEBUG(gpu, "(Re)creating %dx%dx%d texture with format %s: %s", + params->w, params->h, params->d, params->format->name, + PL_DEF(params->debug_tag, "unknown")); + + pl_tex_destroy(gpu, tex); + *tex = pl_tex_create(gpu, params); + + return !!*tex; +} + +void pl_tex_clear_ex(pl_gpu gpu, pl_tex dst, const union pl_clear_color color) +{ + require(dst->params.blit_dst); + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + if (impl->tex_invalidate) + impl->tex_invalidate(gpu, dst); + impl->tex_clear_ex(gpu, dst, color); + return; + +error: + if (dst->params.debug_tag) + PL_ERR(gpu, " for texture: %s", dst->params.debug_tag); +} + +void pl_tex_clear(pl_gpu gpu, pl_tex dst, const float color[4]) +{ + if (!pl_fmt_is_float(dst->params.format)) { + PL_ERR(gpu, "Cannot call `pl_tex_clear` on integer textures, please " + "use `pl_tex_clear_ex` instead."); + return; + } + + const union pl_clear_color col = { + .f = { color[0], color[1], color[2], color[3] }, + }; + + pl_tex_clear_ex(gpu, dst, col); +} + +void pl_tex_invalidate(pl_gpu gpu, pl_tex tex) +{ + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + if (impl->tex_invalidate) + impl->tex_invalidate(gpu, tex); +} + +static void strip_coords(pl_tex tex, pl_rect3d *rc) +{ + if (!tex->params.d) { + rc->z0 = 0; + rc->z1 = 1; + } + + if (!tex->params.h) { + rc->y0 = 0; + rc->y1 = 1; + } +} + +static void infer_rc(pl_tex tex, pl_rect3d *rc) +{ + if 
(!rc->x0 && !rc->x1) + rc->x1 = tex->params.w; + if (!rc->y0 && !rc->y1) + rc->y1 = tex->params.h; + if (!rc->z0 && !rc->z1) + rc->z1 = tex->params.d; +} + +void pl_tex_blit(pl_gpu gpu, const struct pl_tex_blit_params *params) +{ + pl_tex src = params->src, dst = params->dst; + require(src && dst); + pl_fmt src_fmt = src->params.format; + pl_fmt dst_fmt = dst->params.format; + require(src_fmt->internal_size == dst_fmt->internal_size); + require((src_fmt->type == PL_FMT_UINT) == (dst_fmt->type == PL_FMT_UINT)); + require((src_fmt->type == PL_FMT_SINT) == (dst_fmt->type == PL_FMT_SINT)); + require(src->params.blit_src); + require(dst->params.blit_dst); + require(params->sample_mode != PL_TEX_SAMPLE_LINEAR || (src_fmt->caps & PL_FMT_CAP_LINEAR)); + + struct pl_tex_blit_params fixed = *params; + infer_rc(src, &fixed.src_rc); + infer_rc(dst, &fixed.dst_rc); + strip_coords(src, &fixed.src_rc); + strip_coords(dst, &fixed.dst_rc); + + require(fixed.src_rc.x0 >= 0 && fixed.src_rc.x0 < src->params.w); + require(fixed.src_rc.x1 > 0 && fixed.src_rc.x1 <= src->params.w); + require(fixed.dst_rc.x0 >= 0 && fixed.dst_rc.x0 < dst->params.w); + require(fixed.dst_rc.x1 > 0 && fixed.dst_rc.x1 <= dst->params.w); + + if (src->params.h) { + require(fixed.src_rc.y0 >= 0 && fixed.src_rc.y0 < src->params.h); + require(fixed.src_rc.y1 > 0 && fixed.src_rc.y1 <= src->params.h); + } + + if (dst->params.h) { + require(fixed.dst_rc.y0 >= 0 && fixed.dst_rc.y0 < dst->params.h); + require(fixed.dst_rc.y1 > 0 && fixed.dst_rc.y1 <= dst->params.h); + } + + if (src->params.d) { + require(fixed.src_rc.z0 >= 0 && fixed.src_rc.z0 < src->params.d); + require(fixed.src_rc.z1 > 0 && fixed.src_rc.z1 <= src->params.d); + } + + if (dst->params.d) { + require(fixed.dst_rc.z0 >= 0 && fixed.dst_rc.z0 < dst->params.d); + require(fixed.dst_rc.z1 > 0 && fixed.dst_rc.z1 <= dst->params.d); + } + + pl_rect3d full = {0, 0, 0, dst->params.w, dst->params.h, dst->params.d}; + strip_coords(dst, &full); + + pl_rect3d rcnorm = fixed.dst_rc; + pl_rect3d_normalize(&rcnorm); + if (pl_rect3d_eq(rcnorm, full)) + pl_tex_invalidate(gpu, dst); + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + impl->tex_blit(gpu, &fixed); + return; + +error: + if (src->params.debug_tag || dst->params.debug_tag) { + PL_ERR(gpu, " for textures: src %s, dst %s", + PL_DEF(src->params.debug_tag, "(unknown)"), + PL_DEF(dst->params.debug_tag, "(unknown)")); + } +} + +static bool fix_tex_transfer(pl_gpu gpu, struct pl_tex_transfer_params *params) +{ + pl_tex tex = params->tex; + pl_fmt fmt = tex->params.format; + pl_rect3d rc = params->rc; + + // Infer the default values + infer_rc(tex, &rc); + strip_coords(tex, &rc); + + if (!params->row_pitch || !tex->params.w) + params->row_pitch = pl_rect_w(rc) * fmt->texel_size; + if (!params->depth_pitch || !tex->params.d) + params->depth_pitch = pl_rect_h(rc) * params->row_pitch; + + require(params->row_pitch); + require(params->depth_pitch); + params->rc = rc; + + // Check the parameters for sanity + switch (pl_tex_params_dimension(tex->params)) + { + case 3: + require(rc.z1 > rc.z0); + require(rc.z0 >= 0 && rc.z0 < tex->params.d); + require(rc.z1 > 0 && rc.z1 <= tex->params.d); + require(params->depth_pitch >= pl_rect_h(rc) * params->row_pitch); + require(params->depth_pitch % params->row_pitch == 0); + // fall through + case 2: + require(rc.y1 > rc.y0); + require(rc.y0 >= 0 && rc.y0 < tex->params.h); + require(rc.y1 > 0 && rc.y1 <= tex->params.h); + require(params->row_pitch >= pl_rect_w(rc) * fmt->texel_size); + require(params->row_pitch % 
fmt->texel_align == 0); + // fall through + case 1: + require(rc.x1 > rc.x0); + require(rc.x0 >= 0 && rc.x0 < tex->params.w); + require(rc.x1 > 0 && rc.x1 <= tex->params.w); + break; + } + + require(!params->buf ^ !params->ptr); // exactly one + if (params->buf) { + pl_buf buf = params->buf; + size_t size = pl_tex_transfer_size(params); + require(params->buf_offset + size >= params->buf_offset); // overflow check + require(params->buf_offset + size <= buf->params.size); + require(gpu->limits.buf_transfer); + } + + require(!params->callback || gpu->limits.callbacks); + return true; + +error: + if (tex->params.debug_tag) + PL_ERR(gpu, " for texture: %s", tex->params.debug_tag); + return false; +} + +bool pl_tex_upload(pl_gpu gpu, const struct pl_tex_transfer_params *params) +{ + pl_tex tex = params->tex; + require(tex->params.host_writable); + + struct pl_tex_transfer_params fixed = *params; + if (!fix_tex_transfer(gpu, &fixed)) + goto error; + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + return impl->tex_upload(gpu, &fixed); + +error: + if (tex->params.debug_tag) + PL_ERR(gpu, " for texture: %s", tex->params.debug_tag); + return false; +} + +bool pl_tex_download(pl_gpu gpu, const struct pl_tex_transfer_params *params) +{ + pl_tex tex = params->tex; + require(tex->params.host_readable); + + struct pl_tex_transfer_params fixed = *params; + if (!fix_tex_transfer(gpu, &fixed)) + goto error; + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + return impl->tex_download(gpu, &fixed); + +error: + if (tex->params.debug_tag) + PL_ERR(gpu, " for texture: %s", tex->params.debug_tag); + return false; +} + +bool pl_tex_poll(pl_gpu gpu, pl_tex tex, uint64_t t) +{ + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + return impl->tex_poll ? impl->tex_poll(gpu, tex, t) : false; +} + +pl_buf pl_buf_create(pl_gpu gpu, const struct pl_buf_params *params) +{ + struct pl_buf_params params_rounded; + + require(!params->import_handle || !params->export_handle); + if (params->export_handle) { + require(PL_ISPOT(params->export_handle)); + require(params->export_handle & gpu->export_caps.buf); + } + if (params->import_handle) { + require(PL_ISPOT(params->import_handle)); + require(params->import_handle & gpu->import_caps.buf); + const struct pl_shared_mem *shmem = ¶ms->shared_mem; + require(shmem->offset + params->size <= shmem->size); + require(params->import_handle != PL_HANDLE_DMA_BUF || !shmem->drm_format_mod); + + // Fix misalignment on host pointer imports + if (params->import_handle == PL_HANDLE_HOST_PTR) { + uintptr_t page_mask = ~(gpu->limits.align_host_ptr - 1); + uintptr_t ptr_base = (uintptr_t) shmem->handle.ptr & page_mask; + size_t ptr_offset = (uintptr_t) shmem->handle.ptr - ptr_base; + size_t buf_offset = ptr_offset + shmem->offset; + size_t ptr_size = PL_ALIGN2(ptr_offset + shmem->size, + gpu->limits.align_host_ptr); + + if (ptr_base != (uintptr_t) shmem->handle.ptr || ptr_size > shmem->size) { + static bool warned_rounding = false; + if (!warned_rounding) { + warned_rounding = true; + PL_WARN(gpu, "Imported host pointer is not page-aligned. 
" + "This should normally be fine on most platforms, " + "but may cause issues in some rare circumstances."); + } + + PL_TRACE(gpu, "Rounding imported host pointer %p + %zu -> %zu to " + "nearest page boundaries: %p + %zu -> %zu", + shmem->handle.ptr, shmem->offset, shmem->size, + (void *) ptr_base, buf_offset, ptr_size); + } + + params_rounded = *params; + params_rounded.shared_mem.handle.ptr = (void *) ptr_base; + params_rounded.shared_mem.offset = buf_offset; + params_rounded.shared_mem.size = ptr_size; + params = ¶ms_rounded; + } + } + + require(params->size > 0 && params->size <= gpu->limits.max_buf_size); + require(!params->uniform || params->size <= gpu->limits.max_ubo_size); + require(!params->storable || params->size <= gpu->limits.max_ssbo_size); + require(!params->drawable || params->size <= gpu->limits.max_vbo_size); + require(!params->host_mapped || params->size <= gpu->limits.max_mapped_size); + + if (params->format) { + pl_fmt fmt = params->format; + require(params->size <= gpu->limits.max_buffer_texels * fmt->texel_size); + require(!params->uniform || (fmt->caps & PL_FMT_CAP_TEXEL_UNIFORM)); + require(!params->storable || (fmt->caps & PL_FMT_CAP_TEXEL_STORAGE)); + } + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + pl_buf buf = impl->buf_create(gpu, params); + if (buf) + require(!params->host_mapped || buf->data); + + return buf; + +error: + if (params->debug_tag) + PL_ERR(gpu, " for buffer: %s", params->debug_tag); + return NULL; +} + +void pl_buf_destroy(pl_gpu gpu, pl_buf *buf) +{ + if (!*buf) + return; + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + impl->buf_destroy(gpu, *buf); + *buf = NULL; +} + +static bool pl_buf_params_superset(struct pl_buf_params a, struct pl_buf_params b) +{ + return a.size >= b.size && + a.memory_type == b.memory_type && + a.format == b.format && + (a.host_writable || !b.host_writable) && + (a.host_readable || !b.host_readable) && + (a.host_mapped || !b.host_mapped) && + (a.uniform || !b.uniform) && + (a.storable || !b.storable) && + (a.drawable || !b.drawable); +} + +bool pl_buf_recreate(pl_gpu gpu, pl_buf *buf, const struct pl_buf_params *params) +{ + + if (params->initial_data) { + PL_ERR(gpu, "pl_buf_recreate may not be used with `initial_data`!"); + return false; + } + + if (*buf && pl_buf_params_superset((*buf)->params, *params)) + return true; + + PL_INFO(gpu, "(Re)creating %zu buffer", params->size); + pl_buf_destroy(gpu, buf); + *buf = pl_buf_create(gpu, params); + + return !!*buf; +} + +void pl_buf_write(pl_gpu gpu, pl_buf buf, size_t buf_offset, + const void *data, size_t size) +{ + require(buf->params.host_writable); + require(buf_offset + size <= buf->params.size); + require(buf_offset == PL_ALIGN2(buf_offset, 4)); + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + impl->buf_write(gpu, buf, buf_offset, data, size); + return; + +error: + if (buf->params.debug_tag) + PL_ERR(gpu, " for buffer: %s", buf->params.debug_tag); +} + +bool pl_buf_read(pl_gpu gpu, pl_buf buf, size_t buf_offset, + void *dest, size_t size) +{ + require(buf->params.host_readable); + require(buf_offset + size <= buf->params.size); + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + return impl->buf_read(gpu, buf, buf_offset, dest, size); + +error: + if (buf->params.debug_tag) + PL_ERR(gpu, " for buffer: %s", buf->params.debug_tag); + return false; +} + +void pl_buf_copy(pl_gpu gpu, pl_buf dst, size_t dst_offset, + pl_buf src, size_t src_offset, size_t size) +{ + require(src_offset + size <= src->params.size); + require(dst_offset + size <= 
dst->params.size); + require(src != dst); + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + impl->buf_copy(gpu, dst, dst_offset, src, src_offset, size); + return; + +error: + if (src->params.debug_tag || dst->params.debug_tag) { + PL_ERR(gpu, " for buffers: src %s, dst %s", + src->params.debug_tag, dst->params.debug_tag); + } +} + +bool pl_buf_export(pl_gpu gpu, pl_buf buf) +{ + require(buf->params.export_handle || buf->params.import_handle); + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + return impl->buf_export(gpu, buf); + +error: + if (buf->params.debug_tag) + PL_ERR(gpu, " for buffer: %s", buf->params.debug_tag); + return false; +} + +bool pl_buf_poll(pl_gpu gpu, pl_buf buf, uint64_t t) +{ + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + return impl->buf_poll ? impl->buf_poll(gpu, buf, t) : false; +} + +size_t pl_var_type_size(enum pl_var_type type) +{ + switch (type) { + case PL_VAR_SINT: return sizeof(int); + case PL_VAR_UINT: return sizeof(unsigned int); + case PL_VAR_FLOAT: return sizeof(float); + case PL_VAR_INVALID: // fall through + case PL_VAR_TYPE_COUNT: break; + } + + pl_unreachable(); +} + +#define PL_VAR(TYPE, NAME, M, V) \ + struct pl_var pl_var_##NAME(const char *name) { \ + return (struct pl_var) { \ + .name = name, \ + .type = PL_VAR_##TYPE, \ + .dim_m = M, \ + .dim_v = V, \ + .dim_a = 1, \ + }; \ + } + +PL_VAR(FLOAT, float, 1, 1) +PL_VAR(FLOAT, vec2, 1, 2) +PL_VAR(FLOAT, vec3, 1, 3) +PL_VAR(FLOAT, vec4, 1, 4) +PL_VAR(FLOAT, mat2, 2, 2) +PL_VAR(FLOAT, mat2x3, 2, 3) +PL_VAR(FLOAT, mat2x4, 2, 4) +PL_VAR(FLOAT, mat3, 3, 3) +PL_VAR(FLOAT, mat3x4, 3, 4) +PL_VAR(FLOAT, mat4x2, 4, 2) +PL_VAR(FLOAT, mat4x3, 4, 3) +PL_VAR(FLOAT, mat4, 4, 4) +PL_VAR(SINT, int, 1, 1) +PL_VAR(SINT, ivec2, 1, 2) +PL_VAR(SINT, ivec3, 1, 3) +PL_VAR(SINT, ivec4, 1, 4) +PL_VAR(UINT, uint, 1, 1) +PL_VAR(UINT, uvec2, 1, 2) +PL_VAR(UINT, uvec3, 1, 3) +PL_VAR(UINT, uvec4, 1, 4) + +#undef PL_VAR + +const struct pl_named_var pl_var_glsl_types[] = { + // float vectors + { "float", { .type = PL_VAR_FLOAT, .dim_m = 1, .dim_v = 1, .dim_a = 1, }}, + { "vec2", { .type = PL_VAR_FLOAT, .dim_m = 1, .dim_v = 2, .dim_a = 1, }}, + { "vec3", { .type = PL_VAR_FLOAT, .dim_m = 1, .dim_v = 3, .dim_a = 1, }}, + { "vec4", { .type = PL_VAR_FLOAT, .dim_m = 1, .dim_v = 4, .dim_a = 1, }}, + // float matrices + { "mat2", { .type = PL_VAR_FLOAT, .dim_m = 2, .dim_v = 2, .dim_a = 1, }}, + { "mat2x3", { .type = PL_VAR_FLOAT, .dim_m = 2, .dim_v = 3, .dim_a = 1, }}, + { "mat2x4", { .type = PL_VAR_FLOAT, .dim_m = 2, .dim_v = 4, .dim_a = 1, }}, + { "mat3", { .type = PL_VAR_FLOAT, .dim_m = 3, .dim_v = 3, .dim_a = 1, }}, + { "mat3x4", { .type = PL_VAR_FLOAT, .dim_m = 3, .dim_v = 4, .dim_a = 1, }}, + { "mat4x2", { .type = PL_VAR_FLOAT, .dim_m = 4, .dim_v = 2, .dim_a = 1, }}, + { "mat4x3", { .type = PL_VAR_FLOAT, .dim_m = 4, .dim_v = 3, .dim_a = 1, }}, + { "mat4", { .type = PL_VAR_FLOAT, .dim_m = 4, .dim_v = 4, .dim_a = 1, }}, + // integer vectors + { "int", { .type = PL_VAR_SINT, .dim_m = 1, .dim_v = 1, .dim_a = 1, }}, + { "ivec2", { .type = PL_VAR_SINT, .dim_m = 1, .dim_v = 2, .dim_a = 1, }}, + { "ivec3", { .type = PL_VAR_SINT, .dim_m = 1, .dim_v = 3, .dim_a = 1, }}, + { "ivec4", { .type = PL_VAR_SINT, .dim_m = 1, .dim_v = 4, .dim_a = 1, }}, + // unsigned integer vectors + { "uint", { .type = PL_VAR_UINT, .dim_m = 1, .dim_v = 1, .dim_a = 1, }}, + { "uvec2", { .type = PL_VAR_UINT, .dim_m = 1, .dim_v = 2, .dim_a = 1, }}, + { "uvec3", { .type = PL_VAR_UINT, .dim_m = 1, .dim_v = 3, .dim_a = 1, }}, + { "uvec4", { .type = PL_VAR_UINT, 
.dim_m = 1, .dim_v = 4, .dim_a = 1, }}, + + {0}, +}; + +#define MAX_DIM 4 + +const char *pl_var_glsl_type_name(struct pl_var var) +{ + static const char *types[PL_VAR_TYPE_COUNT][MAX_DIM+1][MAX_DIM+1] = { + // float vectors + [PL_VAR_FLOAT][1][1] = "float", + [PL_VAR_FLOAT][1][2] = "vec2", + [PL_VAR_FLOAT][1][3] = "vec3", + [PL_VAR_FLOAT][1][4] = "vec4", + // float matrices + [PL_VAR_FLOAT][2][2] = "mat2", + [PL_VAR_FLOAT][2][3] = "mat2x3", + [PL_VAR_FLOAT][2][4] = "mat2x4", + [PL_VAR_FLOAT][3][2] = "mat3x2", + [PL_VAR_FLOAT][3][3] = "mat3", + [PL_VAR_FLOAT][3][4] = "mat3x4", + [PL_VAR_FLOAT][4][2] = "mat4x2", + [PL_VAR_FLOAT][4][3] = "mat4x3", + [PL_VAR_FLOAT][4][4] = "mat4", + // integer vectors + [PL_VAR_SINT][1][1] = "int", + [PL_VAR_SINT][1][2] = "ivec2", + [PL_VAR_SINT][1][3] = "ivec3", + [PL_VAR_SINT][1][4] = "ivec4", + // unsigned integer vectors + [PL_VAR_UINT][1][1] = "uint", + [PL_VAR_UINT][1][2] = "uvec2", + [PL_VAR_UINT][1][3] = "uvec3", + [PL_VAR_UINT][1][4] = "uvec4", + }; + + if (var.dim_v > MAX_DIM || var.dim_m > MAX_DIM) + return NULL; + + return types[var.type][var.dim_m][var.dim_v]; +} + +struct pl_var pl_var_from_fmt(pl_fmt fmt, const char *name) +{ + static const enum pl_var_type vartypes[] = { + [PL_FMT_FLOAT] = PL_VAR_FLOAT, + [PL_FMT_UNORM] = PL_VAR_FLOAT, + [PL_FMT_SNORM] = PL_VAR_FLOAT, + [PL_FMT_UINT] = PL_VAR_UINT, + [PL_FMT_SINT] = PL_VAR_SINT, + }; + + pl_assert(fmt->type < PL_ARRAY_SIZE(vartypes)); + return (struct pl_var) { + .type = vartypes[fmt->type], + .name = name, + .dim_v = fmt->num_components, + .dim_m = 1, + .dim_a = 1, + }; +} + +struct pl_var_layout pl_var_host_layout(size_t offset, const struct pl_var *var) +{ + size_t col_size = pl_var_type_size(var->type) * var->dim_v; + return (struct pl_var_layout) { + .offset = offset, + .stride = col_size, + .size = col_size * var->dim_m * var->dim_a, + }; +} + +struct pl_var_layout pl_std140_layout(size_t offset, const struct pl_var *var) +{ + size_t el_size = pl_var_type_size(var->type); + + // std140 packing rules: + // 1. The size of generic values is their size in bytes + // 2. The size of vectors is the vector length * the base count + // 3. Matrices are treated like arrays of column vectors + // 4. The size of array rows is that of the element size rounded up to + // the nearest multiple of vec4 + // 5. 
All values are aligned to a multiple of their size (stride for arrays), + // with the exception of vec3 which is aligned like vec4 + size_t stride = el_size * var->dim_v; + size_t align = stride; + if (var->dim_v == 3) + align += el_size; + if (var->dim_m * var->dim_a > 1) + stride = align = PL_ALIGN2(align, sizeof(float[4])); + + return (struct pl_var_layout) { + .offset = PL_ALIGN2(offset, align), + .stride = stride, + .size = stride * var->dim_m * var->dim_a, + }; +} + +struct pl_var_layout pl_std430_layout(size_t offset, const struct pl_var *var) +{ + size_t el_size = pl_var_type_size(var->type); + + // std430 packing rules: like std140, except arrays/matrices are always + // "tightly" packed, even arrays/matrices of vec3s + size_t stride = el_size * var->dim_v; + size_t align = stride; + if (var->dim_v == 3) + align += el_size; + if (var->dim_m * var->dim_a > 1) + stride = align; + + return (struct pl_var_layout) { + .offset = PL_ALIGN2(offset, align), + .stride = stride, + .size = stride * var->dim_m * var->dim_a, + }; +} + +void memcpy_layout(void *dst_p, struct pl_var_layout dst_layout, + const void *src_p, struct pl_var_layout src_layout) +{ + uintptr_t src = (uintptr_t) src_p + src_layout.offset; + uintptr_t dst = (uintptr_t) dst_p + dst_layout.offset; + + if (src_layout.stride == dst_layout.stride) { + pl_assert(dst_layout.size == src_layout.size); + memcpy((void *) dst, (const void *) src, src_layout.size); + return; + } + + size_t stride = PL_MIN(src_layout.stride, dst_layout.stride); + uintptr_t end = src + src_layout.size; + while (src < end) { + pl_assert(dst < dst + dst_layout.size); + memcpy((void *) dst, (const void *) src, stride); + src += src_layout.stride; + dst += dst_layout.stride; + } +} + +int pl_desc_namespace(pl_gpu gpu, enum pl_desc_type type) +{ + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + int ret = impl->desc_namespace(gpu, type); + pl_assert(ret >= 0 && ret < PL_DESC_TYPE_COUNT); + return ret; +} + +const char *pl_desc_access_glsl_name(enum pl_desc_access mode) +{ + switch (mode) { + case PL_DESC_ACCESS_READWRITE: return ""; + case PL_DESC_ACCESS_READONLY: return "readonly"; + case PL_DESC_ACCESS_WRITEONLY: return "writeonly"; + case PL_DESC_ACCESS_COUNT: break; + } + + pl_unreachable(); +} + +const struct pl_blend_params pl_alpha_overlay = { + .src_rgb = PL_BLEND_SRC_ALPHA, + .dst_rgb = PL_BLEND_ONE_MINUS_SRC_ALPHA, + .src_alpha = PL_BLEND_ONE, + .dst_alpha = PL_BLEND_ONE_MINUS_SRC_ALPHA, +}; + +static inline void log_shader_sources(pl_log log, enum pl_log_level level, + const struct pl_pass_params *params) +{ + if (!pl_msg_test(log, level) || !params->glsl_shader) + return; + + switch (params->type) { + case PL_PASS_RASTER: + if (!params->vertex_shader) + return; + pl_msg(log, level, "vertex shader source:"); + pl_msg_source(log, level, params->vertex_shader); + pl_msg(log, level, "fragment shader source:"); + pl_msg_source(log, level, params->glsl_shader); + return; + + case PL_PASS_COMPUTE: + pl_msg(log, level, "compute shader source:"); + pl_msg_source(log, level, params->glsl_shader); + return; + + case PL_PASS_INVALID: + case PL_PASS_TYPE_COUNT: + break; + } + + pl_unreachable(); +} + +static void log_spec_constants(pl_log log, enum pl_log_level lev, + const struct pl_pass_params *params, + const void *constant_data) +{ + if (!constant_data || !params->num_constants || !pl_msg_test(log, lev)) + return; + + pl_msg(log, lev, "Specialization constant values:"); + + uintptr_t data_base = (uintptr_t) constant_data; + for (int i = 0; i < 
params->num_constants; i++) { + union { + int i; + unsigned u; + float f; + } *data = (void *) (data_base + params->constants[i].offset); + int id = params->constants[i].id; + + switch (params->constants[i].type) { + case PL_VAR_SINT: pl_msg(log, lev, " constant_id=%d: %d", id, data->i); break; + case PL_VAR_UINT: pl_msg(log, lev, " constant_id=%d: %u", id, data->u); break; + case PL_VAR_FLOAT: pl_msg(log, lev, " constant_id=%d: %f", id, data->f); break; + default: pl_unreachable(); + } + } +} + +pl_pass pl_pass_create(pl_gpu gpu, const struct pl_pass_params *params) +{ + require(params->glsl_shader); + switch(params->type) { + case PL_PASS_RASTER: + require(params->vertex_shader); + require(params->vertex_stride % gpu->limits.align_vertex_stride == 0); + for (int i = 0; i < params->num_vertex_attribs; i++) { + struct pl_vertex_attrib va = params->vertex_attribs[i]; + require(va.name); + require(va.fmt); + require(va.fmt->caps & PL_FMT_CAP_VERTEX); + require(va.offset + va.fmt->texel_size <= params->vertex_stride); + } + + require(params->target_format); + require(params->target_format->caps & PL_FMT_CAP_RENDERABLE); + require(!params->blend_params || params->target_format->caps & PL_FMT_CAP_BLENDABLE); + require(!params->blend_params || params->load_target); + break; + case PL_PASS_COMPUTE: + require(gpu->glsl.compute); + break; + case PL_PASS_INVALID: + case PL_PASS_TYPE_COUNT: + pl_unreachable(); + } + + size_t num_var_comps = 0; + for (int i = 0; i < params->num_variables; i++) { + struct pl_var var = params->variables[i]; + num_var_comps += var.dim_v * var.dim_m * var.dim_a; + require(var.name); + require(pl_var_glsl_type_name(var)); + } + require(num_var_comps <= gpu->limits.max_variable_comps); + + require(params->num_constants <= gpu->limits.max_constants); + for (int i = 0; i < params->num_constants; i++) + require(params->constants[i].type); + + for (int i = 0; i < params->num_descriptors; i++) { + struct pl_desc desc = params->descriptors[i]; + require(desc.name); + + // enforce disjoint descriptor bindings for each namespace + int namespace = pl_desc_namespace(gpu, desc.type); + for (int j = i+1; j < params->num_descriptors; j++) { + struct pl_desc other = params->descriptors[j]; + require(desc.binding != other.binding || + namespace != pl_desc_namespace(gpu, other.type)); + } + } + + require(params->push_constants_size <= gpu->limits.max_pushc_size); + require(params->push_constants_size == PL_ALIGN2(params->push_constants_size, 4)); + + log_shader_sources(gpu->log, PL_LOG_DEBUG, params); + log_spec_constants(gpu->log, PL_LOG_DEBUG, params, params->constant_data); + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + pl_pass pass = impl->pass_create(gpu, params); + if (!pass) + goto error; + + return pass; + +error: + log_shader_sources(gpu->log, PL_LOG_ERR, params); + pl_log_stack_trace(gpu->log, PL_LOG_ERR); + pl_debug_abort(); + return NULL; +} + +void pl_pass_destroy(pl_gpu gpu, pl_pass *pass) +{ + if (!*pass) + return; + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + impl->pass_destroy(gpu, *pass); + *pass = NULL; +} + +void pl_pass_run(pl_gpu gpu, const struct pl_pass_run_params *params) +{ + pl_pass pass = params->pass; + struct pl_pass_run_params new = *params; + + for (int i = 0; i < pass->params.num_descriptors; i++) { + struct pl_desc desc = pass->params.descriptors[i]; + struct pl_desc_binding db = params->desc_bindings[i]; + require(db.object); + switch (desc.type) { + case PL_DESC_SAMPLED_TEX: { + pl_tex tex = db.object; + pl_fmt fmt = tex->params.format; + 
require(tex->params.sampleable); + require(db.sample_mode != PL_TEX_SAMPLE_LINEAR || (fmt->caps & PL_FMT_CAP_LINEAR)); + break; + } + case PL_DESC_STORAGE_IMG: { + pl_tex tex = db.object; + pl_fmt fmt = tex->params.format; + require(tex->params.storable); + require(desc.access != PL_DESC_ACCESS_READWRITE || (fmt->caps & PL_FMT_CAP_READWRITE)); + break; + } + case PL_DESC_BUF_UNIFORM: { + pl_buf buf = db.object; + require(buf->params.uniform); + break; + } + case PL_DESC_BUF_STORAGE: { + pl_buf buf = db.object; + require(buf->params.storable); + break; + } + case PL_DESC_BUF_TEXEL_UNIFORM: { + pl_buf buf = db.object; + require(buf->params.uniform && buf->params.format); + break; + } + case PL_DESC_BUF_TEXEL_STORAGE: { + pl_buf buf = db.object; + pl_fmt fmt = buf->params.format; + require(buf->params.storable && buf->params.format); + require(desc.access != PL_DESC_ACCESS_READWRITE || (fmt->caps & PL_FMT_CAP_READWRITE)); + break; + } + case PL_DESC_INVALID: + case PL_DESC_TYPE_COUNT: + pl_unreachable(); + } + } + + for (int i = 0; i < params->num_var_updates; i++) { + struct pl_var_update vu = params->var_updates[i]; + require(vu.index >= 0 && vu.index < pass->params.num_variables); + require(vu.data); + } + + require(params->push_constants || !pass->params.push_constants_size); + + switch (pass->params.type) { + case PL_PASS_RASTER: { + switch (pass->params.vertex_type) { + case PL_PRIM_TRIANGLE_LIST: + require(params->vertex_count % 3 == 0); + // fall through + case PL_PRIM_TRIANGLE_STRIP: + require(params->vertex_count >= 3); + break; + case PL_PRIM_TYPE_COUNT: + pl_unreachable(); + } + + require(!params->vertex_data ^ !params->vertex_buf); + if (params->vertex_buf) { + pl_buf vertex_buf = params->vertex_buf; + require(vertex_buf->params.drawable); + if (!params->index_data && !params->index_buf) { + // Cannot bounds check indexed draws + size_t vert_size = params->vertex_count * pass->params.vertex_stride; + require(params->buf_offset + vert_size <= vertex_buf->params.size); + } + } + + require(!params->index_data || !params->index_buf); + if (params->index_buf) { + pl_buf index_buf = params->index_buf; + require(!params->vertex_data); + require(index_buf->params.drawable); + size_t index_size = pl_index_buf_size(params); + require(params->index_offset + index_size <= index_buf->params.size); + } + + pl_tex target = params->target; + require(target); + require(pl_tex_params_dimension(target->params) == 2); + require(target->params.format->signature == pass->params.target_format->signature); + require(target->params.renderable); + pl_rect2d *vp = &new.viewport; + pl_rect2d *sc = &new.scissors; + + // Sanitize viewport/scissors + if (!vp->x0 && !vp->x1) + vp->x1 = target->params.w; + if (!vp->y0 && !vp->y1) + vp->y1 = target->params.h; + + if (!sc->x0 && !sc->x1) + sc->x1 = target->params.w; + if (!sc->y0 && !sc->y1) + sc->y1 = target->params.h; + + // Constrain the scissors to the target dimension (to sanitize the + // underlying graphics API calls) + sc->x0 = PL_CLAMP(sc->x0, 0, target->params.w); + sc->y0 = PL_CLAMP(sc->y0, 0, target->params.h); + sc->x1 = PL_CLAMP(sc->x1, 0, target->params.w); + sc->y1 = PL_CLAMP(sc->y1, 0, target->params.h); + + // Scissors wholly outside target -> silently drop pass (also needed + // to ensure we don't cause UB by specifying invalid scissors) + if (!pl_rect_w(*sc) || !pl_rect_h(*sc)) + return; + + require(pl_rect_w(*vp) > 0); + require(pl_rect_h(*vp) > 0); + require(pl_rect_w(*sc) > 0); + require(pl_rect_h(*sc) > 0); + + if 
(!pass->params.load_target) + pl_tex_invalidate(gpu, target); + break; + } + case PL_PASS_COMPUTE: + for (int i = 0; i < PL_ARRAY_SIZE(params->compute_groups); i++) { + require(params->compute_groups[i] >= 0); + require(params->compute_groups[i] <= gpu->limits.max_dispatch[i]); + } + break; + case PL_PASS_INVALID: + case PL_PASS_TYPE_COUNT: + pl_unreachable(); + } + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + impl->pass_run(gpu, &new); + +error: + return; +} + +void pl_gpu_flush(pl_gpu gpu) +{ + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + if (impl->gpu_flush) + impl->gpu_flush(gpu); +} + +void pl_gpu_finish(pl_gpu gpu) +{ + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + impl->gpu_finish(gpu); +} + +bool pl_gpu_is_failed(pl_gpu gpu) +{ + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + if (!impl->gpu_is_failed) + return false; + + return impl->gpu_is_failed(gpu); +} + +pl_sync pl_sync_create(pl_gpu gpu, enum pl_handle_type handle_type) +{ + require(handle_type); + require(handle_type & gpu->export_caps.sync); + require(PL_ISPOT(handle_type)); + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + return impl->sync_create(gpu, handle_type); + +error: + return NULL; +} + +void pl_sync_destroy(pl_gpu gpu, pl_sync *sync) +{ + if (!*sync) + return; + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + impl->sync_destroy(gpu, *sync); + *sync = NULL; +} + +bool pl_tex_export(pl_gpu gpu, pl_tex tex, pl_sync sync) +{ + require(tex->params.import_handle || tex->params.export_handle); + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + return impl->tex_export(gpu, tex, sync); + +error: + if (tex->params.debug_tag) + PL_ERR(gpu, " for texture: %s", tex->params.debug_tag); + return false; +} + +pl_timer pl_timer_create(pl_gpu gpu) +{ + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + if (!impl->timer_create) + return NULL; + + return impl->timer_create(gpu); +} + +void pl_timer_destroy(pl_gpu gpu, pl_timer *timer) +{ + if (!*timer) + return; + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + impl->timer_destroy(gpu, *timer); + *timer = NULL; +} + +uint64_t pl_timer_query(pl_gpu gpu, pl_timer timer) +{ + if (!timer) + return 0; + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + return impl->timer_query(gpu, timer); +} diff --git a/src/gpu.h b/src/gpu.h new file mode 100644 index 0000000..e915a50 --- /dev/null +++ b/src/gpu.h @@ -0,0 +1,207 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include "common.h" +#include "log.h" + +#include <libplacebo/gpu.h> +#include <libplacebo/dispatch.h> + +// To avoid having to include drm_fourcc.h +#ifndef DRM_FORMAT_MOD_LINEAR +#define DRM_FORMAT_MOD_LINEAR UINT64_C(0x0) +#define DRM_FORMAT_MOD_INVALID ((UINT64_C(1) << 56) - 1) +#endif + +// This struct must be the first member of the gpu's priv struct. The `pl_gpu` +// helpers will cast the priv struct to this struct! 
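Stepping back briefly to pl_pass_run above: callers can rely on the viewport/scissor sanitization and simply leave both rectangles zero-initialized. A hedged sketch, assuming an existing raster `pass`, a matching 2D render `target`, vertex data `verts` and a `bindings` array with one pl_desc_binding per pass descriptor:

    struct pl_pass_run_params run = {
        .pass          = pass,       // assumed PL_PASS_RASTER pass
        .target        = target,     // renderable, same format signature as the pass
        .vertex_count  = 3,          // one triangle
        .vertex_data   = verts,      // matches the pass's vertex layout/stride
        .desc_bindings = bindings,
        // .viewport / .scissors left zeroed -> default to the full target, then
        // clamped; a scissor rect wholly outside the target silently drops the pass.
    };
    pl_pass_run(gpu, &run);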
+ +#define GPU_PFN(name) __typeof__(pl_##name) *name +struct pl_gpu_fns { + // This is a pl_dispatch used (on the pl_gpu itself!) for the purposes of + // dispatching compute shaders for performing various emulation tasks (e.g. + // partial clears, blits or emulated texture transfers, see below). + // + // Warning: Care must be taken to avoid recursive calls. + pl_dispatch dp; + + // Internal cache, or NULL. Set by the user (via pl_gpu_set_cache). + _Atomic(pl_cache) cache; + + // Destructors: These also free the corresponding objects, but they + // must not be called on NULL. (The NULL checks are done by the pl_*_destroy + // wrappers) + void (*destroy)(pl_gpu gpu); + void (*tex_destroy)(pl_gpu, pl_tex); + void (*buf_destroy)(pl_gpu, pl_buf); + void (*pass_destroy)(pl_gpu, pl_pass); + void (*sync_destroy)(pl_gpu, pl_sync); + void (*timer_destroy)(pl_gpu, pl_timer); + + GPU_PFN(tex_create); + GPU_PFN(tex_invalidate); // optional + GPU_PFN(tex_clear_ex); // optional if no blittable formats + GPU_PFN(tex_blit); // optional if no blittable formats + GPU_PFN(tex_upload); + GPU_PFN(tex_download); + GPU_PFN(tex_poll); // optional: if NULL, textures are always free to use + GPU_PFN(buf_create); + GPU_PFN(buf_write); + GPU_PFN(buf_read); + GPU_PFN(buf_copy); + GPU_PFN(buf_export); // optional if !gpu->export_caps.buf + GPU_PFN(buf_poll); // optional: if NULL, buffers are always free to use + GPU_PFN(desc_namespace); + GPU_PFN(pass_create); + GPU_PFN(pass_run); + GPU_PFN(sync_create); // optional if !gpu->export_caps.sync + GPU_PFN(tex_export); // optional if !gpu->export_caps.sync + GPU_PFN(timer_create); // optional + GPU_PFN(timer_query); // optional + GPU_PFN(gpu_flush); // optional + GPU_PFN(gpu_finish); + GPU_PFN(gpu_is_failed); // optional +}; +#undef GPU_PFN + +// All resources such as textures and buffers allocated from the GPU must be +// destroyed before calling pl_destroy. +void pl_gpu_destroy(pl_gpu gpu); + +// Returns true if the device supports interop. This is considered to be +// the case if at least one of `gpu->export/import_caps` is nonzero. +static inline bool pl_gpu_supports_interop(pl_gpu gpu) +{ + return gpu->export_caps.tex || + gpu->import_caps.tex || + gpu->export_caps.buf || + gpu->import_caps.buf || + gpu->export_caps.sync || + gpu->import_caps.sync; +} + +// Returns the GPU-internal `pl_dispatch` and `pl_cache` objects. +pl_dispatch pl_gpu_dispatch(pl_gpu gpu); +pl_cache pl_gpu_cache(pl_gpu gpu); + +// GPU-internal helpers: these should not be used outside of GPU implementations + +// This performs several tasks. It sorts the format list, logs GPU metadata, +// performs verification and fixes up backwards compatibility fields. This +// should be returned as the last step when creating a `pl_gpu`. +pl_gpu pl_gpu_finalize(struct pl_gpu_t *gpu); + +// Look up the right GLSL image format qualifier from a partially filled-in +// pl_fmt, or NULL if the format does not have a legal matching GLSL name. +// +// `components` may differ from fmt->num_components (for emulated formats) +const char *pl_fmt_glsl_format(pl_fmt fmt, int components); + +// Look up the right fourcc from a partially filled-in pl_fmt, or 0 if the +// format does not have a legal matching fourcc format. +uint32_t pl_fmt_fourcc(pl_fmt fmt); + +// Compute the total size (in bytes) of a texture transfer operation +size_t pl_tex_transfer_size(const struct pl_tex_transfer_params *par); + +// Split a tex transfer into slices. For emulated formats, `texel_fmt` gives +// the format of the underlying texel buffer. 
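To make the dispatch-table contract above concrete, here is a loose sketch of how a hypothetical backend might wire itself up at creation time; every `my_*` symbol is an assumption for illustration:

    // The backend's priv struct starts with `struct pl_gpu_fns`, so PL_PRIV()
    // can be cast to it (see the comment above the struct definition).
    struct pl_gpu_fns *impl = PL_PRIV(gpu);
    impl->destroy      = my_destroy;
    impl->tex_create   = my_tex_create;
    impl->tex_destroy  = my_tex_destroy;
    impl->tex_upload   = my_tex_upload;
    impl->tex_download = my_tex_download;
    impl->buf_create   = my_buf_create;
    impl->buf_destroy  = my_buf_destroy;
    impl->pass_create  = my_pass_create;
    impl->pass_destroy = my_pass_destroy;
    impl->pass_run     = my_pass_run;
    // ... remaining mandatory callbacks; optional ones may stay NULL ...
    return pl_gpu_finalize(gpu);   // sorts formats, verifies, logs, creates impl->dp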
+// +// Returns the number of slices, or 0 on error (e.g. no SSBOs available). +// `out_slices` must be freed by caller (on success). +int pl_tex_transfer_slices(pl_gpu gpu, pl_fmt texel_fmt, + const struct pl_tex_transfer_params *params, + struct pl_tex_transfer_params **out_slices); + +// Helper that wraps pl_tex_upload/download using texture upload buffers to +// ensure that params->buf is always set. +bool pl_tex_upload_pbo(pl_gpu gpu, const struct pl_tex_transfer_params *params); +bool pl_tex_download_pbo(pl_gpu gpu, const struct pl_tex_transfer_params *params); + +// This requires that params.buf has been set and is of type PL_BUF_TEXEL_* +bool pl_tex_upload_texel(pl_gpu gpu, const struct pl_tex_transfer_params *params); +bool pl_tex_download_texel(pl_gpu gpu, const struct pl_tex_transfer_params *params); + +// Both `src` and `dst must be storable. `src` must also be sampleable, if the +// blit requires linear sampling. Returns false if these conditions are unmet. +bool pl_tex_blit_compute(pl_gpu gpu, const struct pl_tex_blit_params *params); + +// Helper to do a 2D blit with stretch and scale using a raster pass +void pl_tex_blit_raster(pl_gpu gpu, const struct pl_tex_blit_params *params); + +// Helper for GPU-accelerated endian swapping +// +// Note: `src` and `dst` can be the same buffer, for an in-place operation. In +// this case, `src_offset` and `dst_offset` must be the same. +struct pl_buf_copy_swap_params { + // Source of the copy operation. Must be `storable`. + pl_buf src; + size_t src_offset; + + // Destination of the copy operation. Must be `storable`. + pl_buf dst; + size_t dst_offset; + + // Number of bytes to copy. Must be a multiple of 4. + size_t size; + + // Underlying word size. Must be 2 (for 16-bit swap) or 4 (for 32-bit swap) + int wordsize; +}; + +bool pl_buf_copy_swap(pl_gpu gpu, const struct pl_buf_copy_swap_params *params); + +void pl_pass_run_vbo(pl_gpu gpu, const struct pl_pass_run_params *params); + +// Make a deep-copy of the pass params. Note: cached_program etc. are not +// copied, but cleared explicitly. +struct pl_pass_params pl_pass_params_copy(void *alloc, const struct pl_pass_params *params); + +// Helper to compute the size of an index buffer +static inline size_t pl_index_buf_size(const struct pl_pass_run_params *params) +{ + switch (params->index_fmt) { + case PL_INDEX_UINT16: return params->vertex_count * sizeof(uint16_t); + case PL_INDEX_UINT32: return params->vertex_count * sizeof(uint32_t); + case PL_INDEX_FORMAT_COUNT: break; + } + + pl_unreachable(); +} + +// Helper to compute the size of a vertex buffer required to fit all indices +size_t pl_vertex_buf_size(const struct pl_pass_run_params *params); + +// Utility function for pretty-printing UUIDs +#define UUID_SIZE 16 +#define PRINT_UUID(uuid) (print_uuid((char[3 * UUID_SIZE]){0}, (uuid))) +const char *print_uuid(char buf[3 * UUID_SIZE], const uint8_t uuid[UUID_SIZE]); + +// Helper to pretty-print fourcc codes +#define PRINT_FOURCC(fcc) \ + (!(fcc) ? "" : (char[5]) { \ + (fcc) & 0xFF, \ + ((fcc) >> 8) & 0xFF, \ + ((fcc) >> 16) & 0xFF, \ + ((fcc) >> 24) & 0xFF \ + }) + +#define DRM_MOD_SIZE 26 +#define PRINT_DRM_MOD(mod) (print_drm_mod((char[DRM_MOD_SIZE]){0}, (mod))) +const char *print_drm_mod(char buf[DRM_MOD_SIZE], uint64_t mod); diff --git a/src/gpu/utils.c b/src/gpu/utils.c new file mode 100644 index 0000000..40ca84d --- /dev/null +++ b/src/gpu/utils.c @@ -0,0 +1,1288 @@ +/* + * This file is part of libplacebo. 
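A hedged usage sketch for pl_buf_copy_swap above, assuming `buf` is an existing storable buffer whose size is a multiple of 4:

    // Hypothetical in-place byte swap of 16-bit words (e.g. fixing up the
    // endianness of packed 16-bit texel data directly on the GPU):
    bool ok = pl_buf_copy_swap(gpu, &(struct pl_buf_copy_swap_params) {
        .src = buf, .src_offset = 0,
        .dst = buf, .dst_offset = 0,     // in-place: offsets must match
        .size = buf->params.size,        // assumed to be a multiple of 4
        .wordsize = 2,                   // swap bytes within each 16-bit word
    });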
+ * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <math.h> + +#include "common.h" +#include "shaders.h" +#include "gpu.h" + +// GPU-internal helpers + +static int cmp_fmt(const void *pa, const void *pb) +{ + pl_fmt a = *(pl_fmt *)pa; + pl_fmt b = *(pl_fmt *)pb; + + // Always prefer non-opaque formats + if (a->opaque != b->opaque) + return PL_CMP(a->opaque, b->opaque); + + // Always prefer non-emulated formats + if (a->emulated != b->emulated) + return PL_CMP(a->emulated, b->emulated); + + int ca = __builtin_popcount(a->caps), + cb = __builtin_popcount(b->caps); + if (ca != cb) + return -PL_CMP(ca, cb); // invert to sort higher values first + + // If the population count is the same but the caps are different, prefer + // the caps with a "lower" value (which tend to be more fundamental caps) + if (a->caps != b->caps) + return PL_CMP(a->caps, b->caps); + + // If the capabilities are equal, sort based on the component attributes + for (int i = 0; i < PL_ARRAY_SIZE(a->component_depth); i++) { + int da = a->component_depth[i], + db = b->component_depth[i]; + if (da != db) + return PL_CMP(da, db); + + int ha = a->host_bits[i], + hb = b->host_bits[i]; + if (ha != hb) + return PL_CMP(ha, hb); + + int oa = a->sample_order[i], + ob = b->sample_order[i]; + if (oa != ob) + return PL_CMP(oa, ob); + } + + // Fall back to sorting by the name (for stability) + return strcmp(a->name, b->name); +} + +#define FMT_BOOL(letter, cap) ((cap) ? 
(letter) : '-') +#define FMT_IDX4(f) (f)[0], (f)[1], (f)[2], (f)[3] + +static void print_formats(pl_gpu gpu) +{ + if (!pl_msg_test(gpu->log, PL_LOG_DEBUG)) + return; + +#define CAP_HEADER "%-12s" +#define CAP_FIELDS "%c%c%c%c%c%c%c%c%c%c%c%c" +#define CAP_VALUES \ + FMT_BOOL('S', fmt->caps & PL_FMT_CAP_SAMPLEABLE), \ + FMT_BOOL('s', fmt->caps & PL_FMT_CAP_STORABLE), \ + FMT_BOOL('L', fmt->caps & PL_FMT_CAP_LINEAR), \ + FMT_BOOL('R', fmt->caps & PL_FMT_CAP_RENDERABLE), \ + FMT_BOOL('b', fmt->caps & PL_FMT_CAP_BLENDABLE), \ + FMT_BOOL('B', fmt->caps & PL_FMT_CAP_BLITTABLE), \ + FMT_BOOL('V', fmt->caps & PL_FMT_CAP_VERTEX), \ + FMT_BOOL('u', fmt->caps & PL_FMT_CAP_TEXEL_UNIFORM), \ + FMT_BOOL('t', fmt->caps & PL_FMT_CAP_TEXEL_STORAGE), \ + FMT_BOOL('H', fmt->caps & PL_FMT_CAP_HOST_READABLE), \ + FMT_BOOL('W', fmt->caps & PL_FMT_CAP_READWRITE), \ + FMT_BOOL('G', fmt->gatherable) + + PL_DEBUG(gpu, "GPU texture formats:"); + PL_DEBUG(gpu, " %-20s %-6s %-4s %-4s " CAP_HEADER " %-3s %-13s %-13s %-10s %-10s %-6s", + "NAME", "TYPE", "SIZE", "COMP", "CAPS", "EMU", "DEPTH", "HOST_BITS", + "GLSL_TYPE", "GLSL_FMT", "FOURCC"); + for (int n = 0; n < gpu->num_formats; n++) { + pl_fmt fmt = gpu->formats[n]; + + static const char *types[] = { + [PL_FMT_UNKNOWN] = "UNKNOWN", + [PL_FMT_UNORM] = "UNORM", + [PL_FMT_SNORM] = "SNORM", + [PL_FMT_UINT] = "UINT", + [PL_FMT_SINT] = "SINT", + [PL_FMT_FLOAT] = "FLOAT", + }; + + static const char idx_map[4] = {'R', 'G', 'B', 'A'}; + char indices[4] = {' ', ' ', ' ', ' '}; + if (!fmt->opaque) { + for (int i = 0; i < fmt->num_components; i++) + indices[i] = idx_map[fmt->sample_order[i]]; + } + + + PL_DEBUG(gpu, " %-20s %-6s %-4zu %c%c%c%c " CAP_FIELDS " %-3s " + "{%-2d %-2d %-2d %-2d} {%-2d %-2d %-2d %-2d} %-10s %-10s %-6s", + fmt->name, types[fmt->type], fmt->texel_size, + FMT_IDX4(indices), CAP_VALUES, fmt->emulated ? "y" : "n", + FMT_IDX4(fmt->component_depth), FMT_IDX4(fmt->host_bits), + PL_DEF(fmt->glsl_type, ""), PL_DEF(fmt->glsl_format, ""), + PRINT_FOURCC(fmt->fourcc)); + +#undef CAP_HEADER +#undef CAP_FIELDS +#undef CAP_VALUES + + for (int i = 0; i < fmt->num_modifiers; i++) { + PL_TRACE(gpu, " modifiers[%d]: %s", + i, PRINT_DRM_MOD(fmt->modifiers[i])); + } + } +} + +pl_gpu pl_gpu_finalize(struct pl_gpu_t *gpu) +{ + // Sort formats + qsort(gpu->formats, gpu->num_formats, sizeof(pl_fmt), cmp_fmt); + + // Verification + pl_assert(gpu->limits.max_tex_2d_dim); + pl_assert(gpu->limits.max_variable_comps || gpu->limits.max_ubo_size); + pl_assert(gpu->limits.max_ubo_size <= gpu->limits.max_buf_size); + pl_assert(gpu->limits.max_ssbo_size <= gpu->limits.max_buf_size); + pl_assert(gpu->limits.max_vbo_size <= gpu->limits.max_buf_size); + pl_assert(gpu->limits.max_mapped_size <= gpu->limits.max_buf_size); + + for (int n = 0; n < gpu->num_formats; n++) { + pl_fmt fmt = gpu->formats[n]; + pl_assert(fmt->name); + pl_assert(fmt->type); + pl_assert(fmt->num_components); + pl_assert(fmt->internal_size); + pl_assert(fmt->opaque ? !fmt->texel_size : fmt->texel_size); + pl_assert(!fmt->gatherable || (fmt->caps & PL_FMT_CAP_SAMPLEABLE)); + for (int i = 0; i < fmt->num_components; i++) { + pl_assert(fmt->component_depth[i]); + pl_assert(fmt->opaque ? 
!fmt->host_bits[i] : fmt->host_bits[i]); + } + for (int i = 0; i < fmt->num_planes; i++) + pl_assert(fmt->planes[i].format); + + enum pl_fmt_caps texel_caps = PL_FMT_CAP_VERTEX | + PL_FMT_CAP_TEXEL_UNIFORM | + PL_FMT_CAP_TEXEL_STORAGE; + + if (fmt->caps & texel_caps) { + pl_assert(fmt->glsl_type); + pl_assert(!fmt->opaque); + } + if (!fmt->opaque) { + pl_assert(fmt->texel_size && fmt->texel_align); + pl_assert((fmt->texel_size % fmt->texel_align) == 0); + pl_assert(fmt->internal_size == fmt->texel_size || fmt->emulated); + } else { + pl_assert(!fmt->texel_size && !fmt->texel_align); + pl_assert(!(fmt->caps & PL_FMT_CAP_HOST_READABLE)); + } + + // Assert uniqueness of name + for (int o = n + 1; o < gpu->num_formats; o++) + pl_assert(strcmp(fmt->name, gpu->formats[o]->name) != 0); + } + + // Print info + PL_INFO(gpu, "GPU information:"); + +#define LOG(fmt, field) \ + PL_INFO(gpu, " %-26s %" fmt, #field ":", gpu->LOG_STRUCT.field) + +#define LOG_STRUCT glsl + PL_INFO(gpu, " GLSL version: %d%s", gpu->glsl.version, + gpu->glsl.vulkan ? " (vulkan)" : gpu->glsl.gles ? " es" : ""); + if (gpu->glsl.compute) { + LOG("zu", max_shmem_size); + LOG(PRIu32, max_group_threads); + LOG(PRIu32, max_group_size[0]); + LOG(PRIu32, max_group_size[1]); + LOG(PRIu32, max_group_size[2]); + } + LOG(PRIu32, subgroup_size); + LOG(PRIi16, min_gather_offset); + LOG(PRIi16, max_gather_offset); +#undef LOG_STRUCT + +#define LOG_STRUCT limits + PL_INFO(gpu, " Limits:"); + // pl_gpu + LOG("d", thread_safe); + LOG("d", callbacks); + // pl_buf + LOG("zu", max_buf_size); + LOG("zu", max_ubo_size); + LOG("zu", max_ssbo_size); + LOG("zu", max_vbo_size); + LOG("zu", max_mapped_size); + LOG(PRIu64, max_buffer_texels); + LOG("zu", align_host_ptr); + LOG("d", host_cached); + // pl_tex + LOG(PRIu32, max_tex_1d_dim); + LOG(PRIu32, max_tex_2d_dim); + LOG(PRIu32, max_tex_3d_dim); + LOG("d", blittable_1d_3d); + LOG("d", buf_transfer); + LOG("zu", align_tex_xfer_pitch); + LOG("zu", align_tex_xfer_offset); + // pl_pass + LOG("zu", max_variable_comps); + LOG("zu", max_constants); + LOG("zu", max_pushc_size); + LOG("zu", align_vertex_stride); + if (gpu->glsl.compute) { + LOG(PRIu32, max_dispatch[0]); + LOG(PRIu32, max_dispatch[1]); + LOG(PRIu32, max_dispatch[2]); + } + LOG(PRIu32, fragment_queues); + LOG(PRIu32, compute_queues); +#undef LOG_STRUCT +#undef LOG + + if (pl_gpu_supports_interop(gpu)) { + PL_INFO(gpu, " External API interop:"); + + PL_INFO(gpu, " UUID: %s", PRINT_UUID(gpu->uuid)); + PL_INFO(gpu, " PCI: %04x:%02x:%02x:%x", + gpu->pci.domain, gpu->pci.bus, gpu->pci.device, gpu->pci.function); + PL_INFO(gpu, " buf export caps: 0x%x", + (unsigned int) gpu->export_caps.buf); + PL_INFO(gpu, " buf import caps: 0x%x", + (unsigned int) gpu->import_caps.buf); + PL_INFO(gpu, " tex export caps: 0x%x", + (unsigned int) gpu->export_caps.tex); + PL_INFO(gpu, " tex import caps: 0x%x", + (unsigned int) gpu->import_caps.tex); + PL_INFO(gpu, " sync export caps: 0x%x", + (unsigned int) gpu->export_caps.sync); + PL_INFO(gpu, " sync import caps: 0x%x", + (unsigned int) gpu->import_caps.sync); + } + + print_formats(gpu); + + // Finally, create a `pl_dispatch` object for internal operations + struct pl_gpu_fns *impl = PL_PRIV(gpu); + atomic_init(&impl->cache, NULL); + impl->dp = pl_dispatch_create(gpu->log, gpu); + return gpu; +} + +struct glsl_fmt { + enum pl_fmt_type type; + int num_components; + int depth[4]; + const char *glsl_format; +}; + +// List taken from the GLSL specification. 
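A small usage sketch for pl_fmt_glsl_format (defined just below); the concrete qualifier names are illustrative expectations derived from the table, not guarantees:

    // Hypothetical: derive the image format qualifier for a storage image binding.
    const char *qual = pl_fmt_glsl_format(fmt, fmt->num_components);
    // Expected results: "rg8" for a plain 2x8-bit UNORM format; "rgba8" for an
    // emulated 3x8-bit UNORM format queried with components == 4 (the missing
    // depth is copied from the table entry); NULL if no legal GLSL name exists.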
(Yes, GLSL supports only exactly +// these formats with exactly these names) +static const struct glsl_fmt pl_glsl_fmts[] = { + {PL_FMT_FLOAT, 1, {16}, "r16f"}, + {PL_FMT_FLOAT, 1, {32}, "r32f"}, + {PL_FMT_FLOAT, 2, {16, 16}, "rg16f"}, + {PL_FMT_FLOAT, 2, {32, 32}, "rg32f"}, + {PL_FMT_FLOAT, 4, {16, 16, 16, 16}, "rgba16f"}, + {PL_FMT_FLOAT, 4, {32, 32, 32, 32}, "rgba32f"}, + {PL_FMT_FLOAT, 3, {11, 11, 10}, "r11f_g11f_b10f"}, + + {PL_FMT_UNORM, 1, {8}, "r8"}, + {PL_FMT_UNORM, 1, {16}, "r16"}, + {PL_FMT_UNORM, 2, {8, 8}, "rg8"}, + {PL_FMT_UNORM, 2, {16, 16}, "rg16"}, + {PL_FMT_UNORM, 4, {8, 8, 8, 8}, "rgba8"}, + {PL_FMT_UNORM, 4, {16, 16, 16, 16}, "rgba16"}, + {PL_FMT_UNORM, 4, {10, 10, 10, 2}, "rgb10_a2"}, + + {PL_FMT_SNORM, 1, {8}, "r8_snorm"}, + {PL_FMT_SNORM, 1, {16}, "r16_snorm"}, + {PL_FMT_SNORM, 2, {8, 8}, "rg8_snorm"}, + {PL_FMT_SNORM, 2, {16, 16}, "rg16_snorm"}, + {PL_FMT_SNORM, 4, {8, 8, 8, 8}, "rgba8_snorm"}, + {PL_FMT_SNORM, 4, {16, 16, 16, 16}, "rgba16_snorm"}, + + {PL_FMT_UINT, 1, {8}, "r8ui"}, + {PL_FMT_UINT, 1, {16}, "r16ui"}, + {PL_FMT_UINT, 1, {32}, "r32ui"}, + {PL_FMT_UINT, 2, {8, 8}, "rg8ui"}, + {PL_FMT_UINT, 2, {16, 16}, "rg16ui"}, + {PL_FMT_UINT, 2, {32, 32}, "rg32ui"}, + {PL_FMT_UINT, 4, {8, 8, 8, 8}, "rgba8ui"}, + {PL_FMT_UINT, 4, {16, 16, 16, 16}, "rgba16ui"}, + {PL_FMT_UINT, 4, {32, 32, 32, 32}, "rgba32ui"}, + {PL_FMT_UINT, 4, {10, 10, 10, 2}, "rgb10_a2ui"}, + + {PL_FMT_SINT, 1, {8}, "r8i"}, + {PL_FMT_SINT, 1, {16}, "r16i"}, + {PL_FMT_SINT, 1, {32}, "r32i"}, + {PL_FMT_SINT, 2, {8, 8}, "rg8i"}, + {PL_FMT_SINT, 2, {16, 16}, "rg16i"}, + {PL_FMT_SINT, 2, {32, 32}, "rg32i"}, + {PL_FMT_SINT, 4, {8, 8, 8, 8}, "rgba8i"}, + {PL_FMT_SINT, 4, {16, 16, 16, 16}, "rgba16i"}, + {PL_FMT_SINT, 4, {32, 32, 32, 32}, "rgba32i"}, +}; + +const char *pl_fmt_glsl_format(pl_fmt fmt, int components) +{ + if (fmt->opaque) + return NULL; + + for (int n = 0; n < PL_ARRAY_SIZE(pl_glsl_fmts); n++) { + const struct glsl_fmt *gfmt = &pl_glsl_fmts[n]; + + if (fmt->type != gfmt->type) + continue; + if (components != gfmt->num_components) + continue; + + // The component order is irrelevant, so we need to sort the depth + // based on the component's index + int depth[4] = {0}; + for (int i = 0; i < fmt->num_components; i++) + depth[fmt->sample_order[i]] = fmt->component_depth[i]; + + // Copy over any emulated components + for (int i = fmt->num_components; i < components; i++) + depth[i] = gfmt->depth[i]; + + for (int i = 0; i < PL_ARRAY_SIZE(depth); i++) { + if (depth[i] != gfmt->depth[i]) + goto next_fmt; + } + + return gfmt->glsl_format; + +next_fmt: ; // equivalent to `continue` + } + + return NULL; +} + +#define FOURCC(a,b,c,d) ((uint32_t)(a) | ((uint32_t)(b) << 8) | \ + ((uint32_t)(c) << 16) | ((uint32_t)(d) << 24)) + +struct pl_fmt_fourcc { + const char *name; + uint32_t fourcc; +}; + +static const struct pl_fmt_fourcc pl_fmt_fourccs[] = { + // 8 bpp red + {"r8", FOURCC('R','8',' ',' ')}, + // 16 bpp red + {"r16", FOURCC('R','1','6',' ')}, + // 16 bpp rg + {"rg8", FOURCC('G','R','8','8')}, + {"gr8", FOURCC('R','G','8','8')}, + // 32 bpp rg + {"rg16", FOURCC('G','R','3','2')}, + {"gr16", FOURCC('R','G','3','2')}, + // 8 bpp rgb: N/A + // 16 bpp rgb + {"argb4", FOURCC('B','A','1','2')}, + {"abgr4", FOURCC('R','A','1','2')}, + {"rgba4", FOURCC('A','B','1','2')}, + {"bgra4", FOURCC('A','R','1','2')}, + + {"a1rgb5", FOURCC('B','A','1','5')}, + {"a1bgr5", FOURCC('R','A','1','5')}, + {"rgb5a1", FOURCC('A','B','1','5')}, + {"bgr5a1", FOURCC('A','R','1','5')}, + + {"rgb565", FOURCC('B','G','1','6')}, + 
{"bgr565", FOURCC('R','G','1','6')}, + // 24 bpp rgb + {"rgb8", FOURCC('B','G','2','4')}, + {"bgr8", FOURCC('R','G','2','4')}, + // 32 bpp rgb + {"argb8", FOURCC('B','A','2','4')}, + {"abgr8", FOURCC('R','A','2','4')}, + {"rgba8", FOURCC('A','B','2','4')}, + {"bgra8", FOURCC('A','R','2','4')}, + + {"a2rgb10", FOURCC('B','A','3','0')}, + {"a2bgr10", FOURCC('R','A','3','0')}, + {"rgb10a2", FOURCC('A','B','3','0')}, + {"bgr10a2", FOURCC('A','R','3','0')}, + // 64bpp rgb + {"rgba16hf", FOURCC('A','B','4','H')}, + {"bgra16hf", FOURCC('A','R','4','H')}, + + // packed 16-bit formats + // rx10: N/A + // rxgx10: N/A + {"rxgxbxax10", FOURCC('A','B','1','0')}, + // rx12: N/A + // rxgx12: N/A + // rxgxbxax12: N/A + + // planar formats + {"g8_b8_r8_420", FOURCC('Y','U','1','2')}, + {"g8_b8_r8_422", FOURCC('Y','U','1','6')}, + {"g8_b8_r8_444", FOURCC('Y','U','2','4')}, + // g16_b18_r8_*: N/A + // gx10_bx10_rx10_42*: N/A + {"gx10_bx10_rx10_444", FOURCC('Q','4','1','0')}, + // gx12_bx12_rx12_*:N/A + {"g8_br8_420", FOURCC('N','V','1','2')}, + {"g8_br8_422", FOURCC('N','V','1','6')}, + {"g8_br8_444", FOURCC('N','V','2','4')}, + {"g16_br16_420", FOURCC('P','0','1','6')}, + // g16_br16_422: N/A + // g16_br16_444: N/A + {"gx10_bxrx10_420", FOURCC('P','0','1','0')}, + {"gx10_bxrx10_422", FOURCC('P','2','1','0')}, + // gx10_bxrx10_444: N/A + {"gx12_bxrx12_420", FOURCC('P','0','1','2')}, + // gx12_bxrx12_422: N/A + // gx12_bxrx12_444: N/A +}; + +uint32_t pl_fmt_fourcc(pl_fmt fmt) +{ + for (int n = 0; n < PL_ARRAY_SIZE(pl_fmt_fourccs); n++) { + const struct pl_fmt_fourcc *fourcc = &pl_fmt_fourccs[n]; + if (strcmp(fmt->name, fourcc->name) == 0) + return fourcc->fourcc; + } + + return 0; // no matching format +} + +size_t pl_tex_transfer_size(const struct pl_tex_transfer_params *par) +{ + int w = pl_rect_w(par->rc), h = pl_rect_h(par->rc), d = pl_rect_d(par->rc); + size_t pixel_pitch = par->tex->params.format->texel_size; + + // This generates the absolute bare minimum size of a buffer required to + // hold the data of a texture upload/download, by including stride padding + // only where strictly necessary. + return (d - 1) * par->depth_pitch + (h - 1) * par->row_pitch + w * pixel_pitch; +} + +int pl_tex_transfer_slices(pl_gpu gpu, pl_fmt texel_fmt, + const struct pl_tex_transfer_params *params, + struct pl_tex_transfer_params **out_slices) +{ + PL_ARRAY(struct pl_tex_transfer_params) slices = {0}; + size_t max_size = params->buf ? 
gpu->limits.max_buf_size : SIZE_MAX; + + pl_fmt fmt = params->tex->params.format; + if (fmt->emulated && texel_fmt) { + size_t max_texel = gpu->limits.max_buffer_texels * texel_fmt->texel_size; + max_size = PL_MIN(gpu->limits.max_ssbo_size, max_texel); + } + + int slice_w = pl_rect_w(params->rc); + int slice_h = pl_rect_h(params->rc); + int slice_d = pl_rect_d(params->rc); + + slice_d = PL_MIN(slice_d, max_size / params->depth_pitch); + if (!slice_d) { + slice_d = 1; + slice_h = PL_MIN(slice_h, max_size / params->row_pitch); + if (!slice_h) { + slice_h = 1; + slice_w = PL_MIN(slice_w, max_size / fmt->texel_size); + pl_assert(slice_w); + } + } + + for (int z = 0; z < pl_rect_d(params->rc); z += slice_d) { + for (int y = 0; y < pl_rect_h(params->rc); y += slice_h) { + for (int x = 0; x < pl_rect_w(params->rc); x += slice_w) { + struct pl_tex_transfer_params slice = *params; + slice.callback = NULL; + slice.rc.x0 = params->rc.x0 + x; + slice.rc.y0 = params->rc.y0 + y; + slice.rc.z0 = params->rc.z0 + z; + slice.rc.x1 = PL_MIN(slice.rc.x0 + slice_w, params->rc.x1); + slice.rc.y1 = PL_MIN(slice.rc.y0 + slice_h, params->rc.y1); + slice.rc.z1 = PL_MIN(slice.rc.z0 + slice_d, params->rc.z1); + + const size_t offset = z * params->depth_pitch + + y * params->row_pitch + + x * fmt->texel_size; + if (slice.ptr) { + slice.ptr = (uint8_t *) slice.ptr + offset; + } else { + slice.buf_offset += offset; + } + + PL_ARRAY_APPEND(NULL, slices, slice); + } + } + } + + *out_slices = slices.elem; + return slices.num; +} + +bool pl_tex_upload_pbo(pl_gpu gpu, const struct pl_tex_transfer_params *params) +{ + if (params->buf) + return pl_tex_upload(gpu, params); + + struct pl_buf_params bufparams = { + .size = pl_tex_transfer_size(params), + .debug_tag = PL_DEBUG_TAG, + }; + + struct pl_tex_transfer_params fixed = *params; + fixed.ptr = NULL; + + // If we can import host pointers directly, and the function is being used + // asynchronously, then we can use host pointer import to skip a memcpy. In + // the synchronous case, we still force a host memcpy to avoid stalling the + // host until the GPU memcpy completes. + bool can_import = gpu->import_caps.buf & PL_HANDLE_HOST_PTR; + can_import &= !params->no_import; + can_import &= params->callback != NULL; + can_import &= bufparams.size > (32 << 10); // 32 KiB + if (can_import) { + bufparams.import_handle = PL_HANDLE_HOST_PTR; + bufparams.shared_mem = (struct pl_shared_mem) { + .handle.ptr = params->ptr, + .size = bufparams.size, + .offset = 0, + }; + + // Suppress errors for this test because it may fail, in which case we + // want to silently fall back. 
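A quick numeric check of pl_tex_transfer_size from above; the texture, region and pitch are hypothetical:

    // 100x64 region of an rgba8 texture (texel_size == 4) with padded rows:
    struct pl_tex_transfer_params par = {
        .tex       = tex,   // assumed rgba8 texture
        .rc        = { .x0 = 0, .x1 = 100, .y0 = 0, .y1 = 64, .z0 = 0, .z1 = 1 },
        .row_pitch = 512,   // 100 * 4 = 400 data bytes + 112 bytes padding per row
    };
    size_t size = pl_tex_transfer_size(&par);
    // = (1-1)*depth_pitch + (64-1)*512 + 100*4 = 32256 + 400 = 32656 bytes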
+ pl_log_level_cap(gpu->log, PL_LOG_DEBUG); + fixed.buf = pl_buf_create(gpu, &bufparams); + pl_log_level_cap(gpu->log, PL_LOG_NONE); + } + + if (!fixed.buf) { + bufparams.import_handle = 0; + bufparams.host_writable = true; + fixed.buf = pl_buf_create(gpu, &bufparams); + if (!fixed.buf) + return false; + pl_buf_write(gpu, fixed.buf, 0, params->ptr, bufparams.size); + if (params->callback) + params->callback(params->priv); + fixed.callback = NULL; + } + + bool ok = pl_tex_upload(gpu, &fixed); + pl_buf_destroy(gpu, &fixed.buf); + return ok; +} + +struct pbo_cb_ctx { + pl_gpu gpu; + pl_buf buf; + void *ptr; + void (*callback)(void *priv); + void *priv; +}; + +static void pbo_download_cb(void *priv) +{ + struct pbo_cb_ctx *p = priv; + pl_buf_read(p->gpu, p->buf, 0, p->ptr, p->buf->params.size); + pl_buf_destroy(p->gpu, &p->buf); + + // Run the original callback + p->callback(p->priv); + pl_free(priv); +}; + +bool pl_tex_download_pbo(pl_gpu gpu, const struct pl_tex_transfer_params *params) +{ + if (params->buf) + return pl_tex_download(gpu, params); + + pl_buf buf = NULL; + struct pl_buf_params bufparams = { + .size = pl_tex_transfer_size(params), + .debug_tag = PL_DEBUG_TAG, + }; + + // If we can import host pointers directly, we can avoid an extra memcpy + // (sometimes). In the cases where it isn't avoidable, the extra memcpy + // will happen inside VRAM, which is typically faster anyway. + bool can_import = gpu->import_caps.buf & PL_HANDLE_HOST_PTR; + can_import &= !params->no_import; + can_import &= bufparams.size > (32 << 10); // 32 KiB + if (can_import) { + bufparams.import_handle = PL_HANDLE_HOST_PTR; + bufparams.shared_mem = (struct pl_shared_mem) { + .handle.ptr = params->ptr, + .size = bufparams.size, + .offset = 0, + }; + + // Suppress errors for this test because it may fail, in which case we + // want to silently fall back. + pl_log_level_cap(gpu->log, PL_LOG_DEBUG); + buf = pl_buf_create(gpu, &bufparams); + pl_log_level_cap(gpu->log, PL_LOG_NONE); + } + + if (!buf) { + // Fallback when host pointer import is not supported + bufparams.import_handle = 0; + bufparams.host_readable = true; + buf = pl_buf_create(gpu, &bufparams); + } + + if (!buf) + return false; + + struct pl_tex_transfer_params newparams = *params; + newparams.ptr = NULL; + newparams.buf = buf; + + // If the transfer is asynchronous, propagate our host read asynchronously + if (params->callback && !bufparams.import_handle) { + newparams.callback = pbo_download_cb; + newparams.priv = pl_alloc_struct(NULL, struct pbo_cb_ctx, { + .gpu = gpu, + .buf = buf, + .ptr = params->ptr, + .callback = params->callback, + .priv = params->priv, + }); + } + + if (!pl_tex_download(gpu, &newparams)) { + pl_buf_destroy(gpu, &buf); + return false; + } + + if (!params->callback) { + while (pl_buf_poll(gpu, buf, 10000000)) // 10 ms + PL_TRACE(gpu, "pl_tex_download: synchronous/blocking (slow path)"); + } + + bool ok; + if (bufparams.import_handle) { + // Buffer download completion already means the host pointer contains + // the valid data, no more need to copy. 
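For context on how these PBO helpers are meant to be consumed: a hypothetical backend can route host-pointer transfers through them so that its own transfer path only ever sees buffer sources. A sketch, with `my_tex_upload` being an assumed backend callback:

    static bool my_tex_upload(pl_gpu gpu, const struct pl_tex_transfer_params *params)
    {
        // No buffer given: bounce through a staging buffer (or host pointer
        // import) and re-enter this function with params->buf set.
        if (!params->buf)
            return pl_tex_upload_pbo(gpu, params);

        // ... the backend's actual buffer-based upload path ...
        return true;
    }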
(Note: this applies even for + // asynchronous downloads) + ok = true; + pl_buf_destroy(gpu, &buf); + } else if (!params->callback) { + // Synchronous read back to the host pointer + ok = pl_buf_read(gpu, buf, 0, params->ptr, bufparams.size); + pl_buf_destroy(gpu, &buf); + } else { + // Nothing left to do here, the rest will be done by pbo_download_cb + ok = true; + } + + return ok; +} + +bool pl_tex_upload_texel(pl_gpu gpu, const struct pl_tex_transfer_params *params) +{ + const int threads = PL_MIN(256, pl_rect_w(params->rc)); + pl_tex tex = params->tex; + pl_fmt fmt = tex->params.format; + pl_require(gpu, params->buf); + + pl_dispatch dp = pl_gpu_dispatch(gpu); + pl_shader sh = pl_dispatch_begin(dp); + if (!sh_try_compute(sh, threads, 1, false, 0)) { + PL_ERR(gpu, "Failed emulating texture transfer!"); + pl_dispatch_abort(dp, &sh); + return false; + } + + ident_t buf = sh_desc(sh, (struct pl_shader_desc) { + .binding.object = params->buf, + .desc = { + .name = "data", + .type = PL_DESC_BUF_TEXEL_STORAGE, + }, + }); + + ident_t img = sh_desc(sh, (struct pl_shader_desc) { + .binding.object = params->tex, + .desc = { + .name = "image", + .type = PL_DESC_STORAGE_IMG, + .access = PL_DESC_ACCESS_WRITEONLY, + }, + }); + + // If the transfer width is a natural multiple of the thread size, we + // can skip the bounds check. Otherwise, make sure we aren't blitting out + // of the range since this would read out of bounds. + int groups_x = PL_DIV_UP(pl_rect_w(params->rc), threads); + if (groups_x * threads != pl_rect_w(params->rc)) { + GLSL("if (gl_GlobalInvocationID.x >= %d) \n" + " return; \n", + pl_rect_w(params->rc)); + } + + // fmt->texel_align contains the size of an individual color value + assert(fmt->texel_size == fmt->num_components * fmt->texel_align); + GLSL("vec4 color = vec4(0.0, 0.0, 0.0, 1.0); \n" + "ivec3 pos = ivec3(gl_GlobalInvocationID); \n" + "ivec3 tex_pos = pos + ivec3("$", "$", "$"); \n" + "int base = "$" + pos.z * "$" + pos.y * "$" + pos.x * "$"; \n", + SH_INT_DYN(params->rc.x0), SH_INT_DYN(params->rc.y0), SH_INT_DYN(params->rc.z0), + SH_INT_DYN(params->buf_offset), + SH_INT(params->depth_pitch / fmt->texel_align), + SH_INT(params->row_pitch / fmt->texel_align), + SH_INT(fmt->texel_size / fmt->texel_align)); + + for (int i = 0; i < fmt->num_components; i++) + GLSL("color[%d] = imageLoad("$", base + %d).r; \n", i, buf, i); + + int dims = pl_tex_params_dimension(tex->params); + static const char *coord_types[] = { + [1] = "int", + [2] = "ivec2", + [3] = "ivec3", + }; + + GLSL("imageStore("$", %s(tex_pos), color);\n", img, coord_types[dims]); + return pl_dispatch_compute(dp, pl_dispatch_compute_params( + .shader = &sh, + .dispatch_size = { + groups_x, + pl_rect_h(params->rc), + pl_rect_d(params->rc), + }, + )); + +error: + return false; +} + +bool pl_tex_download_texel(pl_gpu gpu, const struct pl_tex_transfer_params *params) +{ + const int threads = PL_MIN(256, pl_rect_w(params->rc)); + pl_tex tex = params->tex; + pl_fmt fmt = tex->params.format; + pl_require(gpu, params->buf); + + pl_dispatch dp = pl_gpu_dispatch(gpu); + pl_shader sh = pl_dispatch_begin(dp); + if (!sh_try_compute(sh, threads, 1, false, 0)) { + PL_ERR(gpu, "Failed emulating texture transfer!"); + pl_dispatch_abort(dp, &sh); + return false; + } + + ident_t buf = sh_desc(sh, (struct pl_shader_desc) { + .binding.object = params->buf, + .desc = { + .name = "data", + .type = PL_DESC_BUF_TEXEL_STORAGE, + }, + }); + + ident_t img = sh_desc(sh, (struct pl_shader_desc) { + .binding.object = params->tex, + .desc = { + 
.name = "image", + .type = PL_DESC_STORAGE_IMG, + .access = PL_DESC_ACCESS_READONLY, + }, + }); + + int groups_x = PL_DIV_UP(pl_rect_w(params->rc), threads); + if (groups_x * threads != pl_rect_w(params->rc)) { + GLSL("if (gl_GlobalInvocationID.x >= %d) \n" + " return; \n", + pl_rect_w(params->rc)); + } + + int dims = pl_tex_params_dimension(tex->params); + static const char *coord_types[] = { + [1] = "int", + [2] = "ivec2", + [3] = "ivec3", + }; + + assert(fmt->texel_size == fmt->num_components * fmt->texel_align); + GLSL("ivec3 pos = ivec3(gl_GlobalInvocationID); \n" + "ivec3 tex_pos = pos + ivec3("$", "$", "$"); \n" + "int base = "$" + pos.z * "$" + pos.y * "$" + pos.x * "$"; \n" + "vec4 color = imageLoad("$", %s(tex_pos)); \n", + SH_INT_DYN(params->rc.x0), SH_INT_DYN(params->rc.y0), SH_INT_DYN(params->rc.z0), + SH_INT_DYN(params->buf_offset), + SH_INT(params->depth_pitch / fmt->texel_align), + SH_INT(params->row_pitch / fmt->texel_align), + SH_INT(fmt->texel_size / fmt->texel_align), + img, coord_types[dims]); + + for (int i = 0; i < fmt->num_components; i++) + GLSL("imageStore("$", base + %d, vec4(color[%d])); \n", buf, i, i); + + return pl_dispatch_compute(dp, pl_dispatch_compute_params( + .shader = &sh, + .dispatch_size = { + groups_x, + pl_rect_h(params->rc), + pl_rect_d(params->rc), + }, + )); + +error: + return false; +} + +bool pl_tex_blit_compute(pl_gpu gpu, const struct pl_tex_blit_params *params) +{ + if (!params->dst->params.storable) + return false; + + // Normalize `dst_rc`, moving all flipping to `src_rc` instead. + pl_rect3d src_rc = params->src_rc; + pl_rect3d dst_rc = params->dst_rc; + if (pl_rect_w(dst_rc) < 0) { + PL_SWAP(src_rc.x0, src_rc.x1); + PL_SWAP(dst_rc.x0, dst_rc.x1); + } + if (pl_rect_h(dst_rc) < 0) { + PL_SWAP(src_rc.y0, src_rc.y1); + PL_SWAP(dst_rc.y0, dst_rc.y1); + } + if (pl_rect_d(dst_rc) < 0) { + PL_SWAP(src_rc.z0, src_rc.z1); + PL_SWAP(dst_rc.z0, dst_rc.z1); + } + + bool needs_scaling = false; + needs_scaling |= pl_rect_w(dst_rc) != abs(pl_rect_w(src_rc)); + needs_scaling |= pl_rect_h(dst_rc) != abs(pl_rect_h(src_rc)); + needs_scaling |= pl_rect_d(dst_rc) != abs(pl_rect_d(src_rc)); + + // Exception: fast path for 1-pixel blits, which don't require scaling + bool is_1pixel = abs(pl_rect_w(src_rc)) == 1 && abs(pl_rect_h(src_rc)) == 1; + needs_scaling &= !is_1pixel; + + // Manual trilinear interpolation would be too slow to justify + bool needs_sampling = needs_scaling && params->sample_mode != PL_TEX_SAMPLE_NEAREST; + needs_sampling |= !params->src->params.storable; + if (needs_sampling && !params->src->params.sampleable) + return false; + + const int threads = 256; + int bw = PL_MIN(32, pl_rect_w(dst_rc)); + int bh = PL_MIN(threads / bw, pl_rect_h(dst_rc)); + pl_dispatch dp = pl_gpu_dispatch(gpu); + pl_shader sh = pl_dispatch_begin(dp); + if (!sh_try_compute(sh, bw, bh, false, 0)) { + pl_dispatch_abort(dp, &sh); + return false; + } + + // Avoid over-writing into `dst` + int groups_x = PL_DIV_UP(pl_rect_w(dst_rc), bw); + if (groups_x * bw != pl_rect_w(dst_rc)) { + GLSL("if (gl_GlobalInvocationID.x >= %d) \n" + " return; \n", + pl_rect_w(dst_rc)); + } + + int groups_y = PL_DIV_UP(pl_rect_h(dst_rc), bh); + if (groups_y * bh != pl_rect_h(dst_rc)) { + GLSL("if (gl_GlobalInvocationID.y >= %d) \n" + " return; \n", + pl_rect_h(dst_rc)); + } + + ident_t dst = sh_desc(sh, (struct pl_shader_desc) { + .binding.object = params->dst, + .desc = { + .name = "dst", + .type = PL_DESC_STORAGE_IMG, + .access = PL_DESC_ACCESS_WRITEONLY, + }, + }); + + static const char 
*vecs[] = { + [1] = "float", + [2] = "vec2", + [3] = "vec3", + [4] = "vec4", + }; + + static const char *ivecs[] = { + [1] = "int", + [2] = "ivec2", + [3] = "ivec3", + [4] = "ivec4", + }; + + int src_dims = pl_tex_params_dimension(params->src->params); + int dst_dims = pl_tex_params_dimension(params->dst->params); + GLSL("ivec3 pos = ivec3(gl_GlobalInvocationID); \n" + "%s dst_pos = %s(pos + ivec3(%d, %d, %d)); \n", + ivecs[dst_dims], ivecs[dst_dims], + params->dst_rc.x0, params->dst_rc.y0, params->dst_rc.z0); + + if (needs_sampling || (needs_scaling && params->src->params.sampleable)) { + + ident_t src = sh_desc(sh, (struct pl_shader_desc) { + .desc = { + .name = "src", + .type = PL_DESC_SAMPLED_TEX, + }, + .binding = { + .object = params->src, + .address_mode = PL_TEX_ADDRESS_CLAMP, + .sample_mode = params->sample_mode, + } + }); + + if (is_1pixel) { + GLSL("%s fpos = %s(0.5); \n", vecs[src_dims], vecs[src_dims]); + } else { + GLSL("vec3 fpos = (vec3(pos) + vec3(0.5)) / vec3(%d.0, %d.0, %d.0); \n", + pl_rect_w(dst_rc), pl_rect_h(dst_rc), pl_rect_d(dst_rc)); + } + + GLSL("%s src_pos = %s(0.5); \n" + "src_pos.x = mix(%f, %f, fpos.x); \n", + vecs[src_dims], vecs[src_dims], + (float) src_rc.x0 / params->src->params.w, + (float) src_rc.x1 / params->src->params.w); + + if (params->src->params.h) { + GLSL("src_pos.y = mix(%f, %f, fpos.y); \n", + (float) src_rc.y0 / params->src->params.h, + (float) src_rc.y1 / params->src->params.h); + } + + if (params->src->params.d) { + GLSL("src_pos.z = mix(%f, %f, fpos.z); \n", + (float) src_rc.z0 / params->src->params.d, + (float) src_rc.z1 / params->src->params.d); + } + + GLSL("imageStore("$", dst_pos, textureLod("$", src_pos, 0.0)); \n", + dst, src); + + } else { + + ident_t src = sh_desc(sh, (struct pl_shader_desc) { + .binding.object = params->src, + .desc = { + .name = "src", + .type = PL_DESC_STORAGE_IMG, + .access = PL_DESC_ACCESS_READONLY, + }, + }); + + if (is_1pixel) { + GLSL("ivec3 src_pos = ivec3(0); \n"); + } else if (needs_scaling) { + GLSL("ivec3 src_pos = ivec3(vec3(%f, %f, %f) * vec3(pos)); \n", + fabs((float) pl_rect_w(src_rc) / pl_rect_w(dst_rc)), + fabs((float) pl_rect_h(src_rc) / pl_rect_h(dst_rc)), + fabs((float) pl_rect_d(src_rc) / pl_rect_d(dst_rc))); + } else { + GLSL("ivec3 src_pos = pos; \n"); + } + + GLSL("src_pos = ivec3(%d, %d, %d) * src_pos + ivec3(%d, %d, %d); \n" + "imageStore("$", dst_pos, imageLoad("$", %s(src_pos))); \n", + src_rc.x1 < src_rc.x0 ? -1 : 1, + src_rc.y1 < src_rc.y0 ? -1 : 1, + src_rc.z1 < src_rc.z0 ? 
-1 : 1, + src_rc.x0, src_rc.y0, src_rc.z0, + dst, src, ivecs[src_dims]); + + } + + return pl_dispatch_compute(dp, pl_dispatch_compute_params( + .shader = &sh, + .dispatch_size = { + groups_x, + groups_y, + pl_rect_d(dst_rc), + }, + )); +} + +void pl_tex_blit_raster(pl_gpu gpu, const struct pl_tex_blit_params *params) +{ + enum pl_fmt_type src_type = params->src->params.format->type; + enum pl_fmt_type dst_type = params->dst->params.format->type; + + // Only for 2D textures + pl_assert(params->src->params.h && !params->src->params.d); + pl_assert(params->dst->params.h && !params->dst->params.d); + + // Integer textures are not supported + pl_assert(src_type != PL_FMT_UINT && src_type != PL_FMT_SINT); + pl_assert(dst_type != PL_FMT_UINT && dst_type != PL_FMT_SINT); + + pl_rect2df src_rc = { + .x0 = params->src_rc.x0, .x1 = params->src_rc.x1, + .y0 = params->src_rc.y0, .y1 = params->src_rc.y1, + }; + pl_rect2d dst_rc = { + .x0 = params->dst_rc.x0, .x1 = params->dst_rc.x1, + .y0 = params->dst_rc.y0, .y1 = params->dst_rc.y1, + }; + + pl_dispatch dp = pl_gpu_dispatch(gpu); + pl_shader sh = pl_dispatch_begin(dp); + sh->output = PL_SHADER_SIG_COLOR; + + ident_t pos, src = sh_bind(sh, params->src, PL_TEX_ADDRESS_CLAMP, + params->sample_mode, "src_tex", &src_rc, &pos, NULL); + + GLSL("vec4 color = textureLod("$", "$", 0.0); \n", src, pos); + + pl_dispatch_finish(dp, pl_dispatch_params( + .shader = &sh, + .target = params->dst, + .rect = dst_rc, + )); +} + +bool pl_buf_copy_swap(pl_gpu gpu, const struct pl_buf_copy_swap_params *params) +{ + pl_buf src = params->src, dst = params->dst; + pl_require(gpu, src->params.storable && dst->params.storable); + pl_require(gpu, params->src_offset % sizeof(unsigned) == 0); + pl_require(gpu, params->dst_offset % sizeof(unsigned) == 0); + pl_require(gpu, params->src_offset + params->size <= src->params.size); + pl_require(gpu, params->dst_offset + params->size <= dst->params.size); + pl_require(gpu, src != dst || params->src_offset == params->dst_offset); + pl_require(gpu, params->size % sizeof(unsigned) == 0); + pl_require(gpu, params->wordsize == sizeof(uint16_t) || + params->wordsize == sizeof(uint32_t)); + + const size_t words = params->size / sizeof(unsigned); + const size_t src_off = params->src_offset / sizeof(unsigned); + const size_t dst_off = params->dst_offset / sizeof(unsigned); + + const int threads = PL_MIN(256, words); + pl_dispatch dp = pl_gpu_dispatch(gpu); + pl_shader sh = pl_dispatch_begin(dp); + if (!sh_try_compute(sh, threads, 1, false, 0)) { + pl_dispatch_abort(dp, &sh); + return false; + } + + const size_t groups = PL_DIV_UP(words, threads); + if (groups * threads > words) { + GLSL("if (gl_GlobalInvocationID.x >= %zu) \n" + " return; \n", + words); + } + + sh_desc(sh, (struct pl_shader_desc) { + .binding.object = src, + .desc = { + .name = "SrcBuf", + .type = PL_DESC_BUF_STORAGE, + .access = src == dst ? 
PL_DESC_ACCESS_READWRITE : PL_DESC_ACCESS_READONLY, + }, + .num_buffer_vars = 1, + .buffer_vars = &(struct pl_buffer_var) { + .var = { + .name = "src", + .type = PL_VAR_UINT, + .dim_v = 1, + .dim_m = 1, + .dim_a = src_off + words, + }, + }, + }); + + if (src != dst) { + sh_desc(sh, (struct pl_shader_desc) { + .binding.object = dst, + .desc = { + .name = "DstBuf", + .type = PL_DESC_BUF_STORAGE, + .access = PL_DESC_ACCESS_WRITEONLY, + }, + .num_buffer_vars = 1, + .buffer_vars = &(struct pl_buffer_var) { + .var = { + .name = "dst", + .type = PL_VAR_UINT, + .dim_v = 1, + .dim_m = 1, + .dim_a = dst_off + words, + }, + }, + }); + } else { + GLSL("#define dst src \n"); + } + + GLSL("// pl_buf_copy_swap \n" + "{ \n" + "uint word = src["$" + gl_GlobalInvocationID.x]; \n" + "word = (word & 0xFF00FF00u) >> 8 | \n" + " (word & 0x00FF00FFu) << 8; \n", + SH_UINT(src_off)); + if (params->wordsize > 2) { + GLSL("word = (word & 0xFFFF0000u) >> 16 | \n" + " (word & 0x0000FFFFu) << 16; \n"); + } + GLSL("dst["$" + gl_GlobalInvocationID.x] = word; \n" + "} \n", + SH_UINT(dst_off)); + + return pl_dispatch_compute(dp, pl_dispatch_compute_params( + .shader = &sh, + .dispatch_size = {groups, 1, 1}, + )); + +error: + if (src->params.debug_tag || dst->params.debug_tag) { + PL_ERR(gpu, " for buffers: src %s, dst %s", + src->params.debug_tag, dst->params.debug_tag); + } + return false; +} + +void pl_pass_run_vbo(pl_gpu gpu, const struct pl_pass_run_params *params) +{ + if (!params->vertex_data && !params->index_data) + return pl_pass_run(gpu, params); + + struct pl_pass_run_params newparams = *params; + pl_buf vert = NULL, index = NULL; + + if (params->vertex_data) { + vert = pl_buf_create(gpu, pl_buf_params( + .size = pl_vertex_buf_size(params), + .initial_data = params->vertex_data, + .drawable = true, + )); + + if (!vert) { + PL_ERR(gpu, "Failed allocating vertex buffer!"); + return; + } + + newparams.vertex_buf = vert; + newparams.vertex_data = NULL; + } + + if (params->index_data) { + index = pl_buf_create(gpu, pl_buf_params( + .size = pl_index_buf_size(params), + .initial_data = params->index_data, + .drawable = true, + )); + + if (!index) { + PL_ERR(gpu, "Failed allocating index buffer!"); + return; + } + + newparams.index_buf = index; + newparams.index_data = NULL; + } + + pl_pass_run(gpu, &newparams); + pl_buf_destroy(gpu, &vert); + pl_buf_destroy(gpu, &index); +} + +struct pl_pass_params pl_pass_params_copy(void *alloc, const struct pl_pass_params *params) +{ + struct pl_pass_params new = *params; + + new.glsl_shader = pl_str0dup0(alloc, new.glsl_shader); + new.vertex_shader = pl_str0dup0(alloc, new.vertex_shader); + if (new.blend_params) + new.blend_params = pl_memdup_ptr(alloc, new.blend_params); + +#define DUPNAMES(field) \ + do { \ + size_t _size = new.num_##field * sizeof(new.field[0]); \ + new.field = pl_memdup(alloc, new.field, _size); \ + for (int j = 0; j < new.num_##field; j++) \ + new.field[j].name = pl_str0dup0(alloc, new.field[j].name); \ + } while (0) + + DUPNAMES(variables); + DUPNAMES(descriptors); + DUPNAMES(vertex_attribs); + +#undef DUPNAMES + + new.constant_data = NULL; + new.constants = pl_memdup(alloc, new.constants, + new.num_constants * sizeof(new.constants[0])); + + return new; +} + +size_t pl_vertex_buf_size(const struct pl_pass_run_params *params) +{ + if (!params->index_data) + return params->vertex_count * params->pass->params.vertex_stride; + + int num_vertices = 0; + const void *idx = params->index_data; + switch (params->index_fmt) { + case PL_INDEX_UINT16: + for (int i = 0; i 
< params->vertex_count; i++) + num_vertices = PL_MAX(num_vertices, ((const uint16_t *) idx)[i]); + break; + case PL_INDEX_UINT32: + for (int i = 0; i < params->vertex_count; i++) + num_vertices = PL_MAX(num_vertices, ((const uint32_t *) idx)[i]); + break; + case PL_INDEX_FORMAT_COUNT: pl_unreachable(); + } + + return (num_vertices + 1) * params->pass->params.vertex_stride; +} + +const char *print_uuid(char buf[3 * UUID_SIZE], const uint8_t uuid[UUID_SIZE]) +{ + static const char *hexdigits = "0123456789ABCDEF"; + for (int i = 0; i < UUID_SIZE; i++) { + uint8_t x = uuid[i]; + buf[3 * i + 0] = hexdigits[x >> 4]; + buf[3 * i + 1] = hexdigits[x & 0xF]; + buf[3 * i + 2] = i == UUID_SIZE - 1 ? '\0' : ':'; + } + + return buf; +} + +const char *print_drm_mod(char buf[DRM_MOD_SIZE], uint64_t mod) +{ + switch (mod) { + case DRM_FORMAT_MOD_LINEAR: return "LINEAR"; + case DRM_FORMAT_MOD_INVALID: return "INVALID"; + } + + uint8_t vendor = mod >> 56; + uint64_t val = mod & ((1ULL << 56) - 1); + + const char *name = NULL; + switch (vendor) { + case 0x00: name = "NONE"; break; + case 0x01: name = "INTEL"; break; + case 0x02: name = "AMD"; break; + case 0x03: name = "NVIDIA"; break; + case 0x04: name = "SAMSUNG"; break; + case 0x08: name = "ARM"; break; + } + + if (name) { + snprintf(buf, DRM_MOD_SIZE, "%s 0x%"PRIx64, name, val); + } else { + snprintf(buf, DRM_MOD_SIZE, "0x%02x 0x%"PRIx64, vendor, val); + } + + return buf; +} diff --git a/src/hash.h b/src/hash.h new file mode 100644 index 0000000..2513919 --- /dev/null +++ b/src/hash.h @@ -0,0 +1,162 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include "common.h" + +#define GOLDEN_RATIO_64 UINT64_C(0x9e3779b97f4a7c15) + +static inline void pl_hash_merge(uint64_t *accum, uint64_t hash) { + *accum ^= hash + GOLDEN_RATIO_64 + (*accum << 6) + (*accum >> 2); +} + +static inline uint64_t pl_mem_hash(const void *mem, size_t size); +#define pl_var_hash(x) pl_mem_hash(&(x), sizeof(x)) + +static inline uint64_t pl_str_hash(pl_str str) +{ + return pl_mem_hash(str.buf, str.len); +} + +static inline uint64_t pl_str0_hash(const char *str) +{ + return pl_mem_hash(str, str ? strlen(str) : 0); +} + +#ifdef PL_HAVE_XXHASH + +#define XXH_NAMESPACE pl_ +#define XXH_INLINE_ALL +#define XXH_NO_STREAM +#include <xxhash.h> + +XXH_FORCE_INLINE uint64_t pl_mem_hash(const void *mem, size_t size) +{ + return XXH3_64bits(mem, size); +} + +#else // !PL_HAVE_XXHASH + +/* + SipHash reference C implementation + Modified for use by libplacebo: + - Hard-coded a fixed key (k0 and k1) + - Hard-coded the output size to 64 bits + - Return the result vector directly + + Copyright (c) 2012-2016 Jean-Philippe Aumasson + <jeanphilippe.aumasson@gmail.com> + Copyright (c) 2012-2014 Daniel J. 
Bernstein <djb@cr.yp.to> + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + <http://creativecommons.org/publicdomain/zero/1.0/>. + */ + +/* default: SipHash-2-4 */ +#define cROUNDS 2 +#define dROUNDS 4 + +#define ROTL(x, b) (uint64_t)(((x) << (b)) | ((x) >> (64 - (b)))) + +#define U8TO64_LE(p) \ + (((uint64_t)((p)[0])) | ((uint64_t)((p)[1]) << 8) | \ + ((uint64_t)((p)[2]) << 16) | ((uint64_t)((p)[3]) << 24) | \ + ((uint64_t)((p)[4]) << 32) | ((uint64_t)((p)[5]) << 40) | \ + ((uint64_t)((p)[6]) << 48) | ((uint64_t)((p)[7]) << 56)) + +#define SIPROUND \ + do { \ + v0 += v1; \ + v1 = ROTL(v1, 13); \ + v1 ^= v0; \ + v0 = ROTL(v0, 32); \ + v2 += v3; \ + v3 = ROTL(v3, 16); \ + v3 ^= v2; \ + v0 += v3; \ + v3 = ROTL(v3, 21); \ + v3 ^= v0; \ + v2 += v1; \ + v1 = ROTL(v1, 17); \ + v1 ^= v2; \ + v2 = ROTL(v2, 32); \ + } while (0) + +static inline uint64_t pl_mem_hash(const void *mem, size_t size) +{ + if (!size) + return 0x8533321381b8254bULL; + + uint64_t v0 = 0x736f6d6570736575ULL; + uint64_t v1 = 0x646f72616e646f6dULL; + uint64_t v2 = 0x6c7967656e657261ULL; + uint64_t v3 = 0x7465646279746573ULL; + uint64_t k0 = 0xfe9f075098ddb0faULL; + uint64_t k1 = 0x68f7f03510e5285cULL; + uint64_t m; + int i; + const uint8_t *buf = mem; + const uint8_t *end = buf + size - (size % sizeof(uint64_t)); + const int left = size & 7; + uint64_t b = ((uint64_t) size) << 56; + v3 ^= k1; + v2 ^= k0; + v1 ^= k1; + v0 ^= k0; + + for (; buf != end; buf += 8) { + m = U8TO64_LE(buf); + v3 ^= m; + + for (i = 0; i < cROUNDS; ++i) + SIPROUND; + + v0 ^= m; + } + + switch (left) { + case 7: b |= ((uint64_t) buf[6]) << 48; // fall through + case 6: b |= ((uint64_t) buf[5]) << 40; // fall through + case 5: b |= ((uint64_t) buf[4]) << 32; // fall through + case 4: b |= ((uint64_t) buf[3]) << 24; // fall through + case 3: b |= ((uint64_t) buf[2]) << 16; // fall through + case 2: b |= ((uint64_t) buf[1]) << 8; // fall through + case 1: b |= ((uint64_t) buf[0]); break; + case 0: break; + } + + v3 ^= b; + + for (i = 0; i < cROUNDS; ++i) + SIPROUND; + + v0 ^= b; + + v2 ^= 0xff; + + for (i = 0; i < dROUNDS; ++i) + SIPROUND; + + b = v0 ^ v1 ^ v2 ^ v3; + return b; +} + +#endif // PL_HAVE_XXHASH diff --git a/src/include/libplacebo/cache.h b/src/include/libplacebo/cache.h new file mode 100644 index 0000000..5897ac8 --- /dev/null +++ b/src/include/libplacebo/cache.h @@ -0,0 +1,200 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
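A short usage sketch for the hash helpers above; the key layout is hypothetical:

    // Combine several inputs into a single 64-bit cache key:
    int width = 1920, height = 1080;            // example parameters
    uint64_t key = pl_str0_hash("my-shader");   // assumed shader identifier
    pl_hash_merge(&key, pl_var_hash(width));
    pl_hash_merge(&key, pl_var_hash(height));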
+ */ + +#ifndef LIBPLACEBO_CACHE_H_ +#define LIBPLACEBO_CACHE_H_ + +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> + +#include <libplacebo/config.h> +#include <libplacebo/common.h> +#include <libplacebo/log.h> + +PL_API_BEGIN + +typedef struct pl_cache_obj { + // Cache object key. This will uniquely identify this cached object. + uint64_t key; + + // Cache data pointer and length. 0-length cached objects are invalid + // and will be silently dropped. You can explicitly remove a cached + // object by overwriting it with a length 0 object. + void *data; + size_t size; + + // Free callback, to free memory associated with `data`. (Optional) + // Will be called when the object is either explicitly deleted, culled + // due to hitting size limits, or on pl_cache_destroy(). + void (*free)(void *data); +} pl_cache_obj; + +struct pl_cache_params { + // Optional `pl_log` that is used for logging internal events related + // to the cache, such as insertions, saving and loading. + pl_log log; + + // Size limits. If 0, no limit is imposed. + // + // Note: libplacebo will never detect or invalidate stale cache entries, so + // setting an upper size limit is strongly recommended + size_t max_object_size; + size_t max_total_size; + + // Optional external callback to call after a cached object is modified + // (including deletion and (re-)insertion). Note that this is not called on + // objects which are merely pruned from the cache due to `max_total_size`, + // so users must rely on some external mechanism to prune stale entries or + // enforce size limits. + // + // Note: `pl_cache_load` does not trigger this callback. + // Note: Ownership of `obj` does *not* pass to the caller. + // Note: This function must be thread safe. + void (*set)(void *priv, pl_cache_obj obj); + + // Optional external callback to call on a cache miss. Ownership of the + // returned object passes to the `pl_cache`. Objects returned by this + // callback *should* have a valid `free` callback, unless lifetime can be + // externally managed and guaranteed to outlive the `pl_cache`. + // + // Note: This function must be thread safe. + pl_cache_obj (*get)(void *priv, uint64_t key); + + // External context for insert/lookup. + void *priv; +}; + +#define pl_cache_params(...) (&(struct pl_cache_params) { __VA_ARGS__ }) +PL_API extern const struct pl_cache_params pl_cache_default_params; + +// Thread-safety: Safe +// +// Note: In any context in which `pl_cache` is used, users may also pass NULL +// to disable caching. In other words, NULL is a valid `pl_cache`. +typedef const struct pl_cache_t { + struct pl_cache_params params; +} *pl_cache; + +// Create a new cache. This function will never fail. +PL_API pl_cache pl_cache_create(const struct pl_cache_params *params); + +// Destroy a `pl_cache` object, including all underlying objects. +PL_API void pl_cache_destroy(pl_cache *cache); + +// Explicitly clear all objects in the cache without destroying it. This is +// similar to `pl_cache_destroy`, but the cache remains valid afterwards. +// +// Note: Objects destroyed in this way *not* propagated to the `set` callback. +PL_API void pl_cache_reset(pl_cache cache); + +// Return the current internal number of objects and total size (bytes) +PL_API int pl_cache_objects(pl_cache cache); +PL_API size_t pl_cache_size(pl_cache cache); + +// --- Cache saving and loading APIs + +// Serialize the internal state of a `pl_cache` into an abstract cache +// object that can be e.g. saved to disk and loaded again later. 
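A hedged usage sketch for the creation API above; the size limits and the `my_store`/`my_db` callback plumbing are assumptions:

    pl_cache cache = pl_cache_create(pl_cache_params(
        .log             = log,        // optional pl_log, may be NULL
        .max_total_size  = 10 << 20,   // 10 MiB overall budget
        .max_object_size = 1 << 20,    // cap individual objects at 1 MiB
        .set             = my_store,   // assumed callback persisting objects
        .priv            = &my_db,
    ));
    // ... e.g. pl_gpu_set_cache(gpu, cache), render, then: ...
    pl_cache_destroy(&cache);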
Returns the +// number of objects saved. +// +// Note: Using `save/load` is largely redundant with using `insert/lookup` +// callbacks, and the user should decide whether to use the explicit API or the +// callback-based API. +PL_API int pl_cache_save_ex(pl_cache cache, + void (*write)(void *priv, size_t size, const void *ptr), + void *priv); + +// Load the result of a previous `pl_cache_save` call. Any duplicate entries in +// the `pl_cache` will be overwritten. Returns the number of objects loaded, or +// a negative number on serious error (e.g. corrupt header) +// +// Note: This does not trigger the `update` callback. +PL_API int pl_cache_load_ex(pl_cache cache, + bool (*read)(void *priv, size_t size, void *ptr), + void *priv); + +// --- Convenience wrappers around pl_cache_save/load_ex + +// Writes data directly to a pointer. Returns the number of bytes that *would* +// have been written, so this can be used on a size 0 buffer to get the required +// total size. +PL_API size_t pl_cache_save(pl_cache cache, uint8_t *data, size_t size); + +// Reads data directly from a pointer. This still reads from `data`, so it does +// not avoid a copy. +PL_API int pl_cache_load(pl_cache cache, const uint8_t *data, size_t size); + +// Writes/loads data to/from a FILE stream at the current position. +#define pl_cache_save_file(c, file) pl_cache_save_ex(c, pl_write_file_cb, file) +#define pl_cache_load_file(c, file) pl_cache_load_ex(c, pl_read_file_cb, file) + +static inline void pl_write_file_cb(void *priv, size_t size, const void *ptr) +{ + (void) fwrite(ptr, 1, size, (FILE *) priv); +} + +static inline bool pl_read_file_cb(void *priv, size_t size, void *ptr) +{ + return fread(ptr, 1, size, (FILE *) priv) == size; +} + +// --- Object modification API. Mostly intended for internal use. + +// Insert a new cached object into a `pl_cache`. Returns whether successful. +// Overwrites any existing cached object with that signature, so this can be +// used to e.g. delete objects as well (set their size to 0). On success, +// ownership of `obj` passes to the `pl_cache`. +// +// Note: If `object.free` is NULL, this will perform an internal memdup. To +// bypass this (e.g. when directly adding externally managed memory), you can +// set the `free` callback to an explicit noop function. +// +// Note: `obj->data/free` will be reset to NULL on successful insertion. +PL_API bool pl_cache_try_set(pl_cache cache, pl_cache_obj *obj); + +// Variant of `pl_cache_try_set` that simply frees `obj` on failure. +PL_API void pl_cache_set(pl_cache cache, pl_cache_obj *obj); + +// Looks up `obj->key` in the object cache. If successful, `obj->data` is +// set to memory owned by the caller, which must be either explicitly +// re-inserted, or explicitly freed (using obj->free). +// +// Note: On failure, `obj->data/size/free` are reset to NULL. +PL_API bool pl_cache_get(pl_cache cache, pl_cache_obj *obj); + +// Run a callback on every object currently stored in `cache`. +// +// Note: Running any `pl_cache_*` function on `cache` from this callback is +// undefined behavior. 
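// [Editorial example, not part of the original header] A minimal sketch of how
// `pl_cache_iterate` (declared just below) might be driven; the callback name,
// the `pl_cache cache` handle and the size_t accumulator are hypothetical, and
// the callback deliberately avoids calling any other `pl_cache_*` function, as
// required by the note above:
//
//     static void count_bytes_cb(void *priv, pl_cache_obj obj)
//     {
//         *(size_t *) priv += obj.size; // only inspects obj.key / obj.size
//     }
//
//     size_t total = 0;
//     pl_cache_iterate(cache, count_bytes_cb, &total);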
+PL_API void pl_cache_iterate(pl_cache cache, + void (*cb)(void *priv, pl_cache_obj obj), + void *priv); + +// Utility wrapper to free a `pl_cache_obj` if necessary (and sanitize it) +static inline void pl_cache_obj_free(pl_cache_obj *obj) +{ + if (obj->free) + obj->free(obj->data); + obj->data = NULL; + obj->free = NULL; + obj->size = 0; +} + +PL_API_END + +#endif // LIBPLACEBO_CACHE_H_ diff --git a/src/include/libplacebo/colorspace.h b/src/include/libplacebo/colorspace.h new file mode 100644 index 0000000..6663019 --- /dev/null +++ b/src/include/libplacebo/colorspace.h @@ -0,0 +1,719 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_COLORSPACE_H_ +#define LIBPLACEBO_COLORSPACE_H_ + +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> + +#include <libplacebo/common.h> + +PL_API_BEGIN + +// The underlying color representation (e.g. RGB, XYZ or YCbCr) +enum pl_color_system { + PL_COLOR_SYSTEM_UNKNOWN = 0, + // YCbCr-like color systems: + PL_COLOR_SYSTEM_BT_601, // ITU-R Rec. BT.601 (SD) + PL_COLOR_SYSTEM_BT_709, // ITU-R Rec. BT.709 (HD) + PL_COLOR_SYSTEM_SMPTE_240M, // SMPTE-240M + PL_COLOR_SYSTEM_BT_2020_NC, // ITU-R Rec. BT.2020 (non-constant luminance) + PL_COLOR_SYSTEM_BT_2020_C, // ITU-R Rec. BT.2020 (constant luminance) + PL_COLOR_SYSTEM_BT_2100_PQ, // ITU-R Rec. BT.2100 ICtCp PQ variant + PL_COLOR_SYSTEM_BT_2100_HLG, // ITU-R Rec. BT.2100 ICtCp HLG variant + PL_COLOR_SYSTEM_DOLBYVISION, // Dolby Vision (see pl_dovi_metadata) + PL_COLOR_SYSTEM_YCGCO, // YCgCo (derived from RGB) + // Other color systems: + PL_COLOR_SYSTEM_RGB, // Red, Green and Blue + PL_COLOR_SYSTEM_XYZ, // Digital Cinema Distribution Master (XYZ) + PL_COLOR_SYSTEM_COUNT +}; + +PL_API bool pl_color_system_is_ycbcr_like(enum pl_color_system sys); + +// Returns true for color systems that are linear transformations of the RGB +// equivalent, i.e. are simple matrix multiplications. For color systems with +// this property, `pl_color_repr_decode` is sufficient for conversion to RGB. +PL_API bool pl_color_system_is_linear(enum pl_color_system sys); + +// Guesses the best YCbCr-like colorspace based on a image given resolution. +// This only picks conservative values. (In particular, BT.2020 is never +// auto-guessed, even for 4K resolution content) +PL_API enum pl_color_system pl_color_system_guess_ycbcr(int width, int height); + +// Friendly names for the canonical channel names and order. +enum pl_channel { + PL_CHANNEL_NONE = -1, + PL_CHANNEL_A = 3, // alpha + // RGB system + PL_CHANNEL_R = 0, + PL_CHANNEL_G = 1, + PL_CHANNEL_B = 2, + // YCbCr-like systems + PL_CHANNEL_Y = 0, + PL_CHANNEL_CB = 1, + PL_CHANNEL_CR = 2, + // Aliases for Cb/Cr + PL_CHANNEL_U = 1, + PL_CHANNEL_V = 2 + // There are deliberately no names for the XYZ system to avoid + // confusion due to PL_CHANNEL_Y. 
+}; + +// The numerical range of the representation (where applicable). +enum pl_color_levels { + PL_COLOR_LEVELS_UNKNOWN = 0, + PL_COLOR_LEVELS_LIMITED, // Limited/TV range, e.g. 16-235 + PL_COLOR_LEVELS_FULL, // Full/PC range, e.g. 0-255 + PL_COLOR_LEVELS_COUNT, + + // Compatibility aliases + PL_COLOR_LEVELS_TV = PL_COLOR_LEVELS_LIMITED, + PL_COLOR_LEVELS_PC = PL_COLOR_LEVELS_FULL, +}; + +// The alpha representation mode. +enum pl_alpha_mode { + PL_ALPHA_UNKNOWN = 0, // or no alpha channel present + PL_ALPHA_INDEPENDENT, // alpha channel is separate from the video + PL_ALPHA_PREMULTIPLIED, // alpha channel is multiplied into the colors + PL_ALPHA_MODE_COUNT, +}; + +// The underlying bit-wise representation of a color sample. For example, +// a 10-bit TV-range YCbCr value uploaded to a 16 bit texture would have +// sample_depth=16 color_depth=10 bit_shift=0. +// +// For another example, a 12-bit XYZ full range sample shifted to 16-bits with +// the lower 4 bits all set to 0 would have sample_depth=16 color_depth=12 +// bit_shift=4. (libavcodec likes outputting this type of `xyz12`) +// +// To explain the meaning of `sample_depth` further; the consideration factor +// here is the fact that GPU sampling will normalized the sampled color to the +// range 0.0 - 1.0 in a manner dependent on the number of bits in the texture +// format. So if you upload a 10-bit YCbCr value unpadded as 16-bit color +// samples, all of the sampled values will be extremely close to 0.0. In such a +// case, `pl_color_repr_normalize` would return a high scaling factor, which +// would pull the color up to their 16-bit range. +struct pl_bit_encoding { + int sample_depth; // the number of bits the color is stored/sampled as + int color_depth; // the effective number of bits of the color information + int bit_shift; // a representational bit shift applied to the color +}; + +// Returns whether two bit encodings are exactly identical. +PL_API bool pl_bit_encoding_equal(const struct pl_bit_encoding *b1, + const struct pl_bit_encoding *b2); + +// Parsed metadata from the Dolby Vision RPU +struct pl_dovi_metadata { + // Colorspace transformation metadata + float nonlinear_offset[3]; // input offset ("ycc_to_rgb_offset") + pl_matrix3x3 nonlinear; // before PQ, also called "ycc_to_rgb" + pl_matrix3x3 linear; // after PQ, also called "rgb_to_lms" + + // Reshape data, grouped by component + struct pl_reshape_data { + uint8_t num_pivots; + float pivots[9]; // normalized to [0.0, 1.0] based on BL bit depth + uint8_t method[8]; // 0 = polynomial, 1 = MMR + // Note: these must be normalized (divide by coefficient_log2_denom) + float poly_coeffs[8][3]; // x^0, x^1, x^2, unused must be 0 + uint8_t mmr_order[8]; // 1, 2 or 3 + float mmr_constant[8]; + float mmr_coeffs[8][3 /* order */][7]; + } comp[3]; +}; + +// Struct describing the underlying color system and representation. This +// information is needed to convert an encoded color to a normalized RGB triple +// in the range 0-1. +struct pl_color_repr { + enum pl_color_system sys; + enum pl_color_levels levels; + enum pl_alpha_mode alpha; + struct pl_bit_encoding bits; // or {0} if unknown + + // Metadata for PL_COLOR_SYSTEM_DOLBYVISION. Note that, for the sake of + // efficiency, this is treated purely as an opaque reference - functions + // like pl_color_repr_equal will merely do a pointer equality test. + // + // The only functions that actually dereference it in any way are + // pl_color_repr_decode, pl_shader_decode_color and pl_render_image(_mix). 
+ const struct pl_dovi_metadata *dovi; +}; + +// Some common color representations. It's worth pointing out that all of these +// presets leave `alpha` and `bits` as unknown - that is, only the system and +// levels are predefined +PL_API extern const struct pl_color_repr pl_color_repr_unknown; +PL_API extern const struct pl_color_repr pl_color_repr_rgb; +PL_API extern const struct pl_color_repr pl_color_repr_sdtv; +PL_API extern const struct pl_color_repr pl_color_repr_hdtv; // also Blu-ray +PL_API extern const struct pl_color_repr pl_color_repr_uhdtv; // SDR, NCL system +PL_API extern const struct pl_color_repr pl_color_repr_jpeg; + +// Returns whether two colorspace representations are exactly identical. +PL_API bool pl_color_repr_equal(const struct pl_color_repr *c1, + const struct pl_color_repr *c2); + +// Replaces unknown values in the first struct by those of the second struct. +PL_API void pl_color_repr_merge(struct pl_color_repr *orig, + const struct pl_color_repr *update); + +// This function normalizes the color representation such that +// color_depth=sample_depth and bit_shift=0; and returns the scaling factor +// that must be multiplied into the color value to accomplish this, assuming +// it has already been sampled by the GPU. If unknown, the color and sample +// depth will both be inferred as 8 bits for the purposes of this conversion. +PL_API float pl_color_repr_normalize(struct pl_color_repr *repr); + +// Guesses the best color levels based on the specified color levels and +// falling back to using the color system instead. YCbCr-like systems are +// assumed to be TV range, otherwise this defaults to PC range. +PL_API enum pl_color_levels pl_color_levels_guess(const struct pl_color_repr *repr); + +// The colorspace's primaries (gamut) +enum pl_color_primaries { + PL_COLOR_PRIM_UNKNOWN = 0, + // Standard gamut: + PL_COLOR_PRIM_BT_601_525, // ITU-R Rec. BT.601 (525-line = NTSC, SMPTE-C) + PL_COLOR_PRIM_BT_601_625, // ITU-R Rec. BT.601 (625-line = PAL, SECAM) + PL_COLOR_PRIM_BT_709, // ITU-R Rec. BT.709 (HD), also sRGB + PL_COLOR_PRIM_BT_470M, // ITU-R Rec. BT.470 M + PL_COLOR_PRIM_EBU_3213, // EBU Tech. 3213-E / JEDEC P22 phosphors + // Wide gamut: + PL_COLOR_PRIM_BT_2020, // ITU-R Rec. BT.2020 (UltraHD) + PL_COLOR_PRIM_APPLE, // Apple RGB + PL_COLOR_PRIM_ADOBE, // Adobe RGB (1998) + PL_COLOR_PRIM_PRO_PHOTO, // ProPhoto RGB (ROMM) + PL_COLOR_PRIM_CIE_1931, // CIE 1931 RGB primaries + PL_COLOR_PRIM_DCI_P3, // DCI-P3 (Digital Cinema) + PL_COLOR_PRIM_DISPLAY_P3, // DCI-P3 (Digital Cinema) with D65 white point + PL_COLOR_PRIM_V_GAMUT, // Panasonic V-Gamut (VARICAM) + PL_COLOR_PRIM_S_GAMUT, // Sony S-Gamut + PL_COLOR_PRIM_FILM_C, // Traditional film primaries with Illuminant C + PL_COLOR_PRIM_ACES_AP0, // ACES Primaries #0 (ultra wide) + PL_COLOR_PRIM_ACES_AP1, // ACES Primaries #1 + PL_COLOR_PRIM_COUNT +}; + +PL_API bool pl_color_primaries_is_wide_gamut(enum pl_color_primaries prim); + +// Guesses the best primaries based on a resolution. This always guesses +// conservatively, i.e. it will never return a wide gamut color space even if +// the resolution is 4K. +PL_API enum pl_color_primaries pl_color_primaries_guess(int width, int height); + +// The colorspace's transfer function (gamma / EOTF) +enum pl_color_transfer { + PL_COLOR_TRC_UNKNOWN = 0, + // Standard dynamic range: + PL_COLOR_TRC_BT_1886, // ITU-R Rec. 
BT.1886 (CRT emulation + OOTF) + PL_COLOR_TRC_SRGB, // IEC 61966-2-4 sRGB (CRT emulation) + PL_COLOR_TRC_LINEAR, // Linear light content + PL_COLOR_TRC_GAMMA18, // Pure power gamma 1.8 + PL_COLOR_TRC_GAMMA20, // Pure power gamma 2.0 + PL_COLOR_TRC_GAMMA22, // Pure power gamma 2.2 + PL_COLOR_TRC_GAMMA24, // Pure power gamma 2.4 + PL_COLOR_TRC_GAMMA26, // Pure power gamma 2.6 + PL_COLOR_TRC_GAMMA28, // Pure power gamma 2.8 + PL_COLOR_TRC_PRO_PHOTO, // ProPhoto RGB (ROMM) + PL_COLOR_TRC_ST428, // Digital Cinema Distribution Master (XYZ) + // High dynamic range: + PL_COLOR_TRC_PQ, // ITU-R BT.2100 PQ (perceptual quantizer), aka SMPTE ST2048 + PL_COLOR_TRC_HLG, // ITU-R BT.2100 HLG (hybrid log-gamma), aka ARIB STD-B67 + PL_COLOR_TRC_V_LOG, // Panasonic V-Log (VARICAM) + PL_COLOR_TRC_S_LOG1, // Sony S-Log1 + PL_COLOR_TRC_S_LOG2, // Sony S-Log2 + PL_COLOR_TRC_COUNT +}; + +// Returns the nominal peak of a given transfer function, relative to the +// reference white. This refers to the highest encodable signal level. +// Always equal to 1.0 for SDR curves. +// +// Note: For HLG in particular, which is scene-referred, this returns the +// highest nominal peak in scene-referred space (3.77), which may be different +// from the actual peak in display space after application of the HLG OOTF. +PL_API float pl_color_transfer_nominal_peak(enum pl_color_transfer trc); + +static inline bool pl_color_transfer_is_hdr(enum pl_color_transfer trc) +{ + return pl_color_transfer_nominal_peak(trc) > 1.0; +} + +// This defines the display-space standard reference white level (in cd/m^2) +// that is assumed for SDR content, for use when mapping between HDR and SDR in +// display space. See ITU-R Report BT.2408 for more information. +#define PL_COLOR_SDR_WHITE 203.0f + +// This defines the assumed contrast level of an unknown SDR display. This +// will be used to determine the black point in the absence of any tagged +// minimum luminance, relative to the tagged maximum luminance (or +// PL_COLOR_SDR_WHITE in the absence of all tagging) +#define PL_COLOR_SDR_CONTRAST 1000.0f + +// This defines the default black point assumed for "infinite contrast" HDR +// displays. This is not exactly 0.0 because a value of 0.0 is interpreted +// as "unknown / missing metadata" inside struct pl_hdr_metadata, and also +// to avoid numerical issues in a variety of tone mapping functions. +// Essentially, a black level below this number is functionally meaningless +// inside libplacebo, and will be clamped to this value regardless. +// +// The value used here (1e-6) is about one 13-bit PQ step above absolute zero, +// which is a small fraction of the human JND at this brightness level, and also +// about 3 bits above the floating point machine epsilon. +#define PL_COLOR_HDR_BLACK 1e-6f + +// This defines the assumed peak brightness of a HLG display with no HDR10 +// metadata. This is set to the brightness of a "nominal" HLG reference display. +#define PL_COLOR_HLG_PEAK 1000.0f + +// Represents a single CIE xy coordinate (e.g. CIE Yxy with Y = 1.0) +struct pl_cie_xy { + float x, y; +}; + +// Creates a pl_cie_xyz from raw XYZ values +static inline struct pl_cie_xy pl_cie_from_XYZ(float X, float Y, float Z) +{ + float k = 1.0f / (X + Y + Z); + struct pl_cie_xy xy = { k * X, k * Y }; + return xy; +} + +// Recovers (X / Y) from a CIE xy value. +static inline float pl_cie_X(struct pl_cie_xy xy) +{ + return xy.x / xy.y; +} + +// Recovers (Z / Y) from a CIE xy value. 
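// [Editorial note, not from the original header] As a worked example of these
// helpers: for a chromaticity `xy` and an absolute luminance Y (assuming
// xy.y != 0), the corresponding XYZ triple can be recovered as
//
//     float X = pl_cie_X(xy) * Y;
//     float Z = pl_cie_Z(xy) * Y;   // pl_cie_Z is declared just below
//
// which inverts `pl_cie_from_XYZ` up to the overall scale factor Y.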
+static inline float pl_cie_Z(struct pl_cie_xy xy) +{ + return (1 - xy.x - xy.y) / xy.y; +} + +static inline bool pl_cie_xy_equal(const struct pl_cie_xy *a, + const struct pl_cie_xy *b) +{ + return a->x == b->x && a->y == b->y; +} + +// Computes the CIE xy chromaticity coordinates of a CIE D-series illuminant +// with the given correlated color temperature. +// +// `temperature` must be between 2500 K and 25000 K, inclusive. +PL_API struct pl_cie_xy pl_white_from_temp(float temperature); + +// Represents the raw physical primaries corresponding to a color space. +struct pl_raw_primaries { + struct pl_cie_xy red, green, blue, white; +}; + +// Returns whether two raw primaries are exactly identical. +PL_API bool pl_raw_primaries_equal(const struct pl_raw_primaries *a, + const struct pl_raw_primaries *b); + +// Returns whether two raw primaries are approximately equal +PL_API bool pl_raw_primaries_similar(const struct pl_raw_primaries *a, + const struct pl_raw_primaries *b); + +// Replaces unknown values in the first struct by those of the second struct. +PL_API void pl_raw_primaries_merge(struct pl_raw_primaries *orig, + const struct pl_raw_primaries *update); + +// Returns the raw primaries for a given color space. +PL_API const struct pl_raw_primaries *pl_raw_primaries_get(enum pl_color_primaries prim); + +enum pl_hdr_scaling { + PL_HDR_NORM = 0, // 0.0 is absolute black, 1.0 is PL_COLOR_SDR_WHITE + PL_HDR_SQRT, // sqrt() of PL_HDR_NORM values + PL_HDR_NITS, // absolute brightness in raw cd/m² + PL_HDR_PQ, // absolute brightness in PQ (0.0 to 1.0) + PL_HDR_SCALING_COUNT, +}; + +// Generic helper for performing HDR scale conversions. +PL_API float pl_hdr_rescale(enum pl_hdr_scaling from, enum pl_hdr_scaling to, float x); + +enum pl_hdr_metadata_type { + PL_HDR_METADATA_ANY = 0, + PL_HDR_METADATA_NONE, + PL_HDR_METADATA_HDR10, // HDR10 static mastering display metadata + PL_HDR_METADATA_HDR10PLUS, // HDR10+ dynamic metadata + PL_HDR_METADATA_CIE_Y, // CIE Y derived dynamic luminance metadata + PL_HDR_METADATA_TYPE_COUNT, +}; + +// Bezier curve for HDR metadata +struct pl_hdr_bezier { + float target_luma; // target luminance (cd/m²) for this OOTF + float knee_x, knee_y; // cross-over knee point (0-1) + float anchors[15]; // intermediate bezier curve control points (0-1) + uint8_t num_anchors; +}; + +// Represents raw HDR metadata as defined by SMPTE 2086 / CTA 861.3, which is +// often attached to HDR sources and can be forwarded to HDR-capable displays, +// or used to guide the libplacebo built-in tone mapping. Values left as 0 +// are treated as unknown by libplacebo. +// +// Note: This means that a value of `min_luma == 0.0` gets treated as "minimum +// luminance not known", which in practice may end up inferring a default +// contrast of 1000:1 for SDR transfer functions. To avoid this, the user should +// set these fields to a low positive value, e.g. PL_COLOR_HDR_BLACK, to signal +// a "zero" black point (i.e. infinite contrast display). +struct pl_hdr_metadata { + // --- PL_HDR_METADATA_HDR10 + // Mastering display metadata. + struct pl_raw_primaries prim; // mastering display primaries + float min_luma, max_luma; // min/max luminance (in cd/m²) + + // Content light level. 
(Note: this is ignored by libplacebo itself) + float max_cll; // max content light level (in cd/m²) + float max_fall; // max frame average light level (in cd/m²) + + // --- PL_HDR_METADATA_HDR10PLUS + float scene_max[3]; // maxSCL in cd/m² per component (RGB) + float scene_avg; // average of maxRGB in cd/m² + struct pl_hdr_bezier ootf; // reference OOTF (optional) + + // --- PL_HDR_METADATA_CIE_Y + float max_pq_y; // maximum PQ luminance (in PQ, 0-1) + float avg_pq_y; // averaged PQ luminance (in PQ, 0-1) +}; + +PL_API extern const struct pl_hdr_metadata pl_hdr_metadata_empty; // equal to {0} +PL_API extern const struct pl_hdr_metadata pl_hdr_metadata_hdr10; // generic HDR10 display + +// Returns whether two sets of HDR metadata are exactly identical. +PL_API bool pl_hdr_metadata_equal(const struct pl_hdr_metadata *a, + const struct pl_hdr_metadata *b); + +// Replaces unknown values in the first struct by those of the second struct. +PL_API void pl_hdr_metadata_merge(struct pl_hdr_metadata *orig, + const struct pl_hdr_metadata *update); + +// Returns `true` if `data` contains a complete set of a given metadata type. +// Note: for PL_HDR_METADATA_HDR10, only `min_luma` and `max_luma` are +// considered - CLL/FALL and primaries are irrelevant for HDR tone-mapping. +PL_API bool pl_hdr_metadata_contains(const struct pl_hdr_metadata *data, + enum pl_hdr_metadata_type type); + +// Rendering intent for colorspace transformations. These constants match the +// ICC specification (Table 23) +enum pl_rendering_intent { + PL_INTENT_AUTO = -1, // not a valid ICC intent, but used to auto-infer + PL_INTENT_PERCEPTUAL = 0, + PL_INTENT_RELATIVE_COLORIMETRIC = 1, + PL_INTENT_SATURATION = 2, + PL_INTENT_ABSOLUTE_COLORIMETRIC = 3 +}; + +// Struct describing a physical color space. This information is needed to +// turn a normalized RGB triple into its physical meaning, as well as to convert +// between color spaces. +struct pl_color_space { + enum pl_color_primaries primaries; + enum pl_color_transfer transfer; + + // HDR metadata for this color space, if present. (Optional) + struct pl_hdr_metadata hdr; +}; + +#define pl_color_space(...) (&(struct pl_color_space) { __VA_ARGS__ }) + +// Returns whether or not a color space is considered as effectively HDR. +// This is true when the effective signal peak is greater than the SDR +// reference white (1.0), taking into account `csp->hdr`. +PL_API bool pl_color_space_is_hdr(const struct pl_color_space *csp); + +// Returns whether or not a color space is "black scaled", in which case 0.0 is +// the true black point. This is true for SDR signals other than BT.1886, as +// well as for HLG. +PL_API bool pl_color_space_is_black_scaled(const struct pl_color_space *csp); + +struct pl_nominal_luma_params { + // The color space to infer luminance from + const struct pl_color_space *color; + + // Which type of metadata to draw values from + enum pl_hdr_metadata_type metadata; + + // This field controls the scaling of `out_*` + enum pl_hdr_scaling scaling; + + // Fields to write the detected nominal luminance to. (Optional) + // + // For SDR displays, this will default to a contrast level of 1000:1 unless + // indicated otherwise in the `min/max_luma` static HDR10 metadata fields. + float *out_min; + float *out_max; + + // Field to write the detected average luminance to, or 0.0 in the absence + // of dynamic metadata. (Optional) + float *out_avg; +}; + +#define pl_nominal_luma_params(...) 
\ + (&(struct pl_nominal_luma_params) { __VA_ARGS__ }) + +// Returns the effective luminance described by a pl_color_space. +PL_API void pl_color_space_nominal_luma_ex(const struct pl_nominal_luma_params *params); + +// Backwards compatibility wrapper for `pl_color_space_nominal_luma_ex` +PL_DEPRECATED PL_API void pl_color_space_nominal_luma(const struct pl_color_space *csp, + float *out_min, float *out_max); + +// Replaces unknown values in the first struct by those of the second struct. +PL_API void pl_color_space_merge(struct pl_color_space *orig, + const struct pl_color_space *update); + +// Returns whether two colorspaces are exactly identical. +PL_API bool pl_color_space_equal(const struct pl_color_space *c1, + const struct pl_color_space *c2); + +// Go through a color-space and explicitly default all unknown fields to +// reasonable values. After this function is called, none of the values will be +// PL_COLOR_*_UNKNOWN or 0.0, except for the dynamic HDR metadata fields. +PL_API void pl_color_space_infer(struct pl_color_space *space); + +// Like `pl_color_space_infer`, but takes default values from the reference +// color space (excluding certain special cases like HDR or wide gamut). +PL_API void pl_color_space_infer_ref(struct pl_color_space *space, + const struct pl_color_space *ref); + +// Infer both the source and destination gamut simultaneously, and also adjust +// values for optimal display. This is mostly the same as +// `pl_color_space_infer(src)` followed by `pl_color_space_infer_ref`, but also +// takes into account the SDR contrast levels and PQ black points. This is +// basically the logic used by `pl_shader_color_map` and `pl_renderer` to +// decide the output color space in a conservative way and compute the final +// end-to-end color transformation that needs to be done. +PL_API void pl_color_space_infer_map(struct pl_color_space *src, + struct pl_color_space *dst); + +// Some common color spaces. Note: These don't necessarily have all fields +// filled, in particular `hdr` is left unset. +PL_API extern const struct pl_color_space pl_color_space_unknown; +PL_API extern const struct pl_color_space pl_color_space_srgb; +PL_API extern const struct pl_color_space pl_color_space_bt709; +PL_API extern const struct pl_color_space pl_color_space_hdr10; +PL_API extern const struct pl_color_space pl_color_space_bt2020_hlg; +PL_API extern const struct pl_color_space pl_color_space_monitor; // typical display + +// This represents metadata about extra operations to perform during colorspace +// conversion, which correspond to artistic adjustments of the color. +struct pl_color_adjustment { + // Brightness boost. 0.0 = neutral, 1.0 = solid white, -1.0 = solid black + float brightness; + // Contrast boost. 1.0 = neutral, 0.0 = solid black + float contrast; + // Saturation gain. 1.0 = neutral, 0.0 = grayscale + float saturation; + // Hue shift, corresponding to a rotation around the [U, V] subvector, in + // radians. 0.0 = neutral + float hue; + // Gamma adjustment. 1.0 = neutral, 0.0 = solid black + float gamma; + // Color temperature shift. 0.0 = 6500 K, -1.0 = 3000 K, 1.0 = 10000 K + float temperature; +}; + +#define PL_COLOR_ADJUSTMENT_NEUTRAL \ + .contrast = 1.0, \ + .saturation = 1.0, \ + .gamma = 1.0, + +#define pl_color_adjustment(...) (&(struct pl_color_adjustment) { PL_COLOR_ADJUSTMENT_NEUTRAL __VA_ARGS__ }) +PL_API extern const struct pl_color_adjustment pl_color_adjustment_neutral; + +// Represents the chroma placement with respect to the luma samples. 
This is +// only relevant for YCbCr-like colorspaces with chroma subsampling. +enum pl_chroma_location { + PL_CHROMA_UNKNOWN = 0, + PL_CHROMA_LEFT, // MPEG2/4, H.264 + PL_CHROMA_CENTER, // MPEG1, JPEG + PL_CHROMA_TOP_LEFT, + PL_CHROMA_TOP_CENTER, + PL_CHROMA_BOTTOM_LEFT, + PL_CHROMA_BOTTOM_CENTER, + PL_CHROMA_COUNT, +}; + +// Fills *x and *y with the offset in luma pixels corresponding to a given +// chroma location. +// +// Note: PL_CHROMA_UNKNOWN defaults to PL_CHROMA_LEFT +PL_API void pl_chroma_location_offset(enum pl_chroma_location loc, float *x, float *y); + +// Returns an RGB->XYZ conversion matrix for a given set of primaries. +// Multiplying this into the RGB color transforms it to CIE XYZ, centered +// around the color space's white point. +PL_API pl_matrix3x3 pl_get_rgb2xyz_matrix(const struct pl_raw_primaries *prim); + +// Similar to pl_get_rgb2xyz_matrix, but gives the inverse transformation. +PL_API pl_matrix3x3 pl_get_xyz2rgb_matrix(const struct pl_raw_primaries *prim); + +// Returns a primary adaptation matrix, which converts from one set of +// primaries to another. This is an RGB->RGB transformation. For rendering +// intents other than PL_INTENT_ABSOLUTE_COLORIMETRIC, the white point is +// adapted using the Bradford matrix. +PL_API pl_matrix3x3 pl_get_color_mapping_matrix(const struct pl_raw_primaries *src, + const struct pl_raw_primaries *dst, + enum pl_rendering_intent intent); + +// Return a chromatic adaptation matrix, which converts from one white point to +// another, using the Bradford matrix. This is an RGB->RGB transformation. +PL_API pl_matrix3x3 pl_get_adaptation_matrix(struct pl_cie_xy src, struct pl_cie_xy dst); + +// Returns true if 'b' is entirely contained in 'a'. Useful for figuring out if +// colorimetric clipping will occur or not. +PL_API bool pl_primaries_superset(const struct pl_raw_primaries *a, + const struct pl_raw_primaries *b); + +// Returns true if `prim` forms a nominally valid set of primaries. This does +// not check whether or not these primaries are actually physically realisable, +// merely that they satisfy the requirements for colorspace math (to avoid NaN). +PL_API bool pl_primaries_valid(const struct pl_raw_primaries *prim); + +// Returns true if two primaries are 'compatible', which is the case if +// they preserve the relationship between primaries (red=red, green=green, +// blue=blue). In other words, this is false for synthetic primaries that have +// channels misordered from the convention (e.g. for some test ICC profiles). +PL_API bool pl_primaries_compatible(const struct pl_raw_primaries *a, + const struct pl_raw_primaries *b); + +// Clip points in the first gamut (src) to be fully contained inside the second +// gamut (dst). Only works on compatible primaries (pl_primaries_compatible). +PL_API struct pl_raw_primaries +pl_primaries_clip(const struct pl_raw_primaries *src, + const struct pl_raw_primaries *dst); + +// Primary-dependent RGB->LMS matrix for the IPTPQc4 color system. This is +// derived from the HPE XYZ->LMS matrix with 4% crosstalk added. +PL_API pl_matrix3x3 pl_ipt_rgb2lms(const struct pl_raw_primaries *prim); +PL_API pl_matrix3x3 pl_ipt_lms2rgb(const struct pl_raw_primaries *prim); + +// Primary-independent L'M'S' -> IPT matrix for the IPTPQc4 color system, and +// its inverse. This is identical to the Ebner & Fairchild (1998) IPT matrix. 
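// [Editorial sketch, not part of the original header] Combining the two stages
// above, an RGB -> IPT conversion for BT.2020 primaries could be assembled
// roughly as follows; the intermediate PQ encoding of the LMS values is only
// indicated as a comment, since it is a non-linear step that no matrix can
// express, and the input color values are placeholders:
//
//     const struct pl_raw_primaries *prim =
//         pl_raw_primaries_get(PL_COLOR_PRIM_BT_2020);
//     pl_matrix3x3 rgb2lms = pl_ipt_rgb2lms(prim);
//     float c[3] = { 0.5f, 0.2f, 0.1f };      // linear RGB input (illustrative)
//     pl_matrix3x3_apply(&rgb2lms, c);        // -> LMS
//     /* apply the PQ encoding to c[0..2] here to obtain L'M'S' */
//     pl_matrix3x3_apply(&pl_ipt_lms2ipt, c); // -> IPT (declared just below)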
+PL_API extern const pl_matrix3x3 pl_ipt_lms2ipt; +PL_API extern const pl_matrix3x3 pl_ipt_ipt2lms; + +// Cone types involved in human vision +enum pl_cone { + PL_CONE_L = 1 << 0, + PL_CONE_M = 1 << 1, + PL_CONE_S = 1 << 2, + + // Convenience aliases + PL_CONE_NONE = 0, + PL_CONE_LM = PL_CONE_L | PL_CONE_M, + PL_CONE_MS = PL_CONE_M | PL_CONE_S, + PL_CONE_LS = PL_CONE_L | PL_CONE_S, + PL_CONE_LMS = PL_CONE_L | PL_CONE_M | PL_CONE_S, +}; + +// Structure describing parameters for simulating color blindness +struct pl_cone_params { + enum pl_cone cones; // Which cones are *affected* by the vision model + float strength; // Coefficient for how strong the defect is + // (1.0 = Unaffected, 0.0 = Full blindness) +}; + +#define pl_cone_params(...) (&(struct pl_cone_params) { __VA_ARGS__ }) + +// Built-in color blindness models +PL_API extern const struct pl_cone_params pl_vision_normal; // No distortion (92%) +PL_API extern const struct pl_cone_params pl_vision_protanomaly; // Red deficiency (0.66%) +PL_API extern const struct pl_cone_params pl_vision_protanopia; // Red absence (0.59%) +PL_API extern const struct pl_cone_params pl_vision_deuteranomaly; // Green deficiency (2.7%) +PL_API extern const struct pl_cone_params pl_vision_deuteranopia; // Green absence (0.56%) +PL_API extern const struct pl_cone_params pl_vision_tritanomaly; // Blue deficiency (0.01%) +PL_API extern const struct pl_cone_params pl_vision_tritanopia; // Blue absence (0.016%) +PL_API extern const struct pl_cone_params pl_vision_monochromacy; // Blue cones only (<0.001%) +PL_API extern const struct pl_cone_params pl_vision_achromatopsia; // Rods only (<0.0001%) + +// Returns a cone adaptation matrix. Applying this to an RGB color in the given +// color space will apply the given cone adaptation coefficients for simulating +// a type of color blindness. +// +// For the color blindness models which don't entail complete loss of a cone, +// you can partially counteract the effect by using a similar model with the +// `strength` set to its inverse. For example, to partially counteract +// deuteranomaly, you could generate a cone matrix for PL_CONE_M with the +// strength 2.0 (or some other number above 1.0). +PL_API pl_matrix3x3 pl_get_cone_matrix(const struct pl_cone_params *params, + const struct pl_raw_primaries *prim); + +// Returns a color decoding matrix for a given combination of source color +// representation and adjustment parameters. This mutates `repr` to reflect the +// change. If `params` is NULL, it defaults to &pl_color_adjustment_neutral. +// +// This function always performs a conversion to RGB. To convert to other +// colorspaces (e.g. between YUV systems), obtain a second YUV->RGB matrix +// and invert it using `pl_transform3x3_invert`. +// +// Note: For BT.2020 constant-luminance, this outputs chroma information in the +// range [-0.5, 0.5]. Since the CL system conversion is non-linear, further +// processing must be done by the caller. The channel order is CrYCb. +// +// Note: For BT.2100 ICtCp, this outputs in the color space L'M'S'. Further +// non-linear processing must be done by the caller. +// +// Note: XYZ system is expected to be in DCDM X'Y'Z' encoding (ST 428-1), in +// practice this means normalizing by (48.0 / 52.37) factor and applying 2.6 gamma +PL_API pl_transform3x3 pl_color_repr_decode(struct pl_color_repr *repr, + const struct pl_color_adjustment *params); + +// Common struct to describe an ICC profile +struct pl_icc_profile { + // Points to the in-memory representation of the ICC profile. 
This is + // allowed to be NULL, in which case the `pl_icc_profile` represents "no + // profile”. + const void *data; + size_t len; + + // If a profile is set, this signature must uniquely identify it (including + // across restarts, for caching), ideally using a checksum of the profile + // contents. The user is free to choose the method of determining this + // signature, but note the existence of the + // `pl_icc_profile_compute_signature` helper. + uint64_t signature; +}; + +#define pl_icc_profile(...) &(struct pl_icc_profile) { __VA_ARGS__ } + +// This doesn't do a comparison of the actual contents, only of the signature. +PL_API bool pl_icc_profile_equal(const struct pl_icc_profile *p1, + const struct pl_icc_profile *p2); + +// Sets `signature` to a hash of `profile->data`, if non-NULL. Provided as a +// convenience function for the sake of users ingesting arbitrary ICC profiles +// from sources where they can't reliably detect profile changes. +// +// Note: This is based on a very fast hash, and will compute a signature for +// even large (10 MB) ICC profiles in, typically, a fraction of a millisecond. +PL_API void pl_icc_profile_compute_signature(struct pl_icc_profile *profile); + +PL_API_END + +#endif // LIBPLACEBO_COLORSPACE_H_ diff --git a/src/include/libplacebo/common.h b/src/include/libplacebo/common.h new file mode 100644 index 0000000..806730c --- /dev/null +++ b/src/include/libplacebo/common.h @@ -0,0 +1,244 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_COMMON_H_ +#define LIBPLACEBO_COMMON_H_ + +#include <stdbool.h> + +#include <libplacebo/config.h> + +PL_API_BEGIN + +// Some common utility types. These are overloaded to support 2D, 3D and +// integer/float variants. +typedef struct pl_rect2d { + int x0, y0; + int x1, y1; +} pl_rect2d; + +typedef struct pl_rect3d { + int x0, y0, z0; + int x1, y1, z1; +} pl_rect3d; + +typedef struct pl_rect2df { + float x0, y0; + float x1, y1; +} pl_rect2df; + +typedef struct pl_rect3df { + float x0, y0, z0; + float x1, y1, z1; +} pl_rect3df; + +// These macros will work for any of the above pl_rect variants (with enough +// dimensions). Careful: double-evaluation hazard +#define pl_rect_w(r) ((r).x1 - (r).x0) +#define pl_rect_h(r) ((r).y1 - (r).y0) +#define pl_rect_d(r) ((r).z1 - (r).z0) + +#define pl_rect2d_eq(a, b) \ + ((a).x0 == (b).x0 && (a).x1 == (b).x1 && \ + (a).y0 == (b).y0 && (a).y1 == (b).y1) + +#define pl_rect3d_eq(a, b) \ + ((a).x0 == (b).x0 && (a).x1 == (b).x1 && \ + (a).y0 == (b).y0 && (a).y1 == (b).y1 && \ + (a).z0 == (b).z0 && (a).z1 == (b).z1) + +// "Normalize" a rectangle: This ensures d1 >= d0 for all dimensions. 
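// [Editorial example, not from the original header] E.g. a flipped rect
// { .x0 = 100, .y0 = 0, .x1 = 0, .y1 = 50 } normalizes to
// { .x0 = 0, .y0 = 0, .x1 = 100, .y1 = 50 }. Note that this discards the flip
// orientation, so pl_rect_w/pl_rect_h on a normalized rect are always >= 0.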
+PL_API void pl_rect2d_normalize(pl_rect2d *rc); +PL_API void pl_rect3d_normalize(pl_rect3d *rc); + +PL_API void pl_rect2df_normalize(pl_rect2df *rc); +PL_API void pl_rect3df_normalize(pl_rect3df *rc); + +// Return the rounded form of a rect. +PL_API pl_rect2d pl_rect2df_round(const pl_rect2df *rc); +PL_API pl_rect3d pl_rect3df_round(const pl_rect3df *rc); + +// Represents a row-major matrix, i.e. the following matrix +// [ a11 a12 a13 ] +// [ a21 a22 a23 ] +// [ a31 a32 a33 ] +// is represented in C like this: +// { { a11, a12, a13 }, +// { a21, a22, a23 }, +// { a31, a32, a33 } }; +typedef struct pl_matrix3x3 { + float m[3][3]; +} pl_matrix3x3; + +PL_API extern const pl_matrix3x3 pl_matrix3x3_identity; + +// Applies a matrix to a float vector in-place. +PL_API void pl_matrix3x3_apply(const pl_matrix3x3 *mat, float vec[3]); + +// Applies a matrix to a pl_rect3df +PL_API void pl_matrix3x3_apply_rc(const pl_matrix3x3 *mat, pl_rect3df *rc); + +// Scales a color matrix by a linear factor. +PL_API void pl_matrix3x3_scale(pl_matrix3x3 *mat, float scale); + +// Inverts a matrix. Only use where precision is not that important. +PL_API void pl_matrix3x3_invert(pl_matrix3x3 *mat); + +// Composes/multiplies two matrices. Multiples B into A, i.e. +// A := A * B +PL_API void pl_matrix3x3_mul(pl_matrix3x3 *a, const pl_matrix3x3 *b); + +// Flipped version of `pl_matrix3x3_mul`. +// B := A * B +PL_API void pl_matrix3x3_rmul(const pl_matrix3x3 *a, pl_matrix3x3 *b); + +// Represents an affine transformation, which is basically a 3x3 matrix +// together with a column vector to add onto the output. +typedef struct pl_transform3x3 { + pl_matrix3x3 mat; + float c[3]; +} pl_transform3x3; + +PL_API extern const pl_transform3x3 pl_transform3x3_identity; + +// Applies a transform to a float vector in-place. +PL_API void pl_transform3x3_apply(const pl_transform3x3 *t, float vec[3]); + +// Applies a transform to a pl_rect3df +PL_API void pl_transform3x3_apply_rc(const pl_transform3x3 *t, pl_rect3df *rc); + +// Scales the output of a transform by a linear factor. Since an affine +// transformation is non-linear, this does not commute. If you want to scale +// the *input* of a transform, use pl_matrix3x3_scale on `t.mat`. +PL_API void pl_transform3x3_scale(pl_transform3x3 *t, float scale); + +// Inverts a transform. Only use where precision is not that important. +PL_API void pl_transform3x3_invert(pl_transform3x3 *t); + +// 2D analog of the above structs. Since these are featured less prominently, +// we omit some of the other helper functions. 
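// [Editorial note, not part of the original header] The 2D variants below
// follow the same affine convention as the 3x3 transforms above, i.e. the
// following two snippets are equivalent (`t` and `vec` are placeholders):
//
//     pl_transform2x2_apply(&t, vec);
//
//     pl_matrix2x2_apply(&t.mat, vec);
//     vec[0] += t.c[0];
//     vec[1] += t.c[1];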
+typedef struct pl_matrix2x2 { + float m[2][2]; +} pl_matrix2x2; + +PL_API extern const pl_matrix2x2 pl_matrix2x2_identity; +PL_API pl_matrix2x2 pl_matrix2x2_rotation(float angle); + +PL_API void pl_matrix2x2_apply(const pl_matrix2x2 *mat, float vec[2]); +PL_API void pl_matrix2x2_apply_rc(const pl_matrix2x2 *mat, pl_rect2df *rc); + +PL_API void pl_matrix2x2_mul(pl_matrix2x2 *a, const pl_matrix2x2 *b); +PL_API void pl_matrix2x2_rmul(const pl_matrix2x2 *a, pl_matrix2x2 *b); + +PL_API void pl_matrix2x2_scale(pl_matrix2x2 *mat, float scale); +PL_API void pl_matrix2x2_invert(pl_matrix2x2 *mat); + +typedef struct pl_transform2x2 { + pl_matrix2x2 mat; + float c[2]; +} pl_transform2x2; + +PL_API extern const pl_transform2x2 pl_transform2x2_identity; + +PL_API void pl_transform2x2_apply(const pl_transform2x2 *t, float vec[2]); +PL_API void pl_transform2x2_apply_rc(const pl_transform2x2 *t, pl_rect2df *rc); + +PL_API void pl_transform2x2_mul(pl_transform2x2 *a, const pl_transform2x2 *b); +PL_API void pl_transform2x2_rmul(const pl_transform2x2 *a, pl_transform2x2 *b); + +PL_API void pl_transform2x2_scale(pl_transform2x2 *t, float scale); +PL_API void pl_transform2x2_invert(pl_transform2x2 *t); + +// Compute new bounding box of a transformation (as applied to a given rect). +PL_API pl_rect2df pl_transform2x2_bounds(const pl_transform2x2 *t, + const pl_rect2df *rc); + +// Helper functions for dealing with aspect ratios and stretched/scaled rects. + +// Return the (absolute) aspect ratio (width/height) of a given pl_rect2df. +// This will always be a positive number, even if `rc` is flipped. +PL_API float pl_rect2df_aspect(const pl_rect2df *rc); + +// Set the aspect of a `rc` to a given aspect ratio with an extra 'panscan' +// factor choosing the balance between shrinking and growing the `rc` to meet +// this aspect ratio. +// +// Notes: +// - If `panscan` is 0.0, this function will only ever shrink the `rc`. +// - If `panscan` is 1.0, this function will only ever grow the `rc`. +// - If `panscan` is 0.5, this function is area-preserving. +PL_API void pl_rect2df_aspect_set(pl_rect2df *rc, float aspect, float panscan); + +// Set one rect's aspect to that of another +#define pl_rect2df_aspect_copy(rc, src, panscan) \ + pl_rect2df_aspect_set((rc), pl_rect2df_aspect(src), (panscan)) + +// 'Fit' one rect inside another. `rc` will be set to the same size and aspect +// ratio as `src`, but with the size limited to fit inside the original `rc`. +// Like `pl_rect2df_aspect_set`, `panscan` controls the pan&scan factor. +PL_API void pl_rect2df_aspect_fit(pl_rect2df *rc, const pl_rect2df *src, float panscan); + +// Scale rect in each direction while keeping it centered. +PL_API void pl_rect2df_stretch(pl_rect2df *rc, float stretch_x, float stretch_y); + +// Offset rect by an arbitrary offset factor. If the corresponding dimension +// of a rect is flipped, so too is the applied offset. +PL_API void pl_rect2df_offset(pl_rect2df *rc, float offset_x, float offset_y); + +// Scale a rect uniformly in both dimensions. +#define pl_rect2df_zoom(rc, zoom) pl_rect2df_stretch((rc), (zoom), (zoom)) + +// Rotation in degrees clockwise +typedef int pl_rotation; +enum { + PL_ROTATION_0 = 0, + PL_ROTATION_90 = 1, + PL_ROTATION_180 = 2, + PL_ROTATION_270 = 3, + PL_ROTATION_360 = 4, // equivalent to PL_ROTATION_0 + + // Note: Values outside the range [0,4) are legal, including negatives. +}; + +// Constrains to the interval [PL_ROTATION_0, PL_ROTATION_360). 
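// [Editorial example, not from the original header] For instance,
// pl_rotation_normalize(-PL_ROTATION_90) == PL_ROTATION_270, and
// pl_rotation_normalize(PL_ROTATION_360 + PL_ROTATION_180) == PL_ROTATION_180,
// so arbitrary sums of rotations can be folded back into the interval [0, 4).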
+static inline pl_rotation pl_rotation_normalize(pl_rotation rot) +{ + return (rot % PL_ROTATION_360 + PL_ROTATION_360) % PL_ROTATION_360; +} + +// Rotates the coordinate system of a `pl_rect2d(f)` in a certain direction. +// For example, calling this with PL_ROTATION_90 will correspond to rotating +// the coordinate system 90° to the right (so the x axis becomes the y axis). +// +// The resulting rect is re-normalized in the same coordinate system. +PL_API void pl_rect2df_rotate(pl_rect2df *rc, pl_rotation rot); + +// Returns the aspect ratio in a rotated frame of reference. +static inline float pl_aspect_rotate(float aspect, pl_rotation rot) +{ + return (rot % PL_ROTATION_180) ? 1.0 / aspect : aspect; +} + +#define pl_rect2df_aspect_set_rot(rc, aspect, rot, panscan) \ + pl_rect2df_aspect_set((rc), pl_aspect_rotate((aspect), (rot)), (panscan)) + +#define pl_rect2df_aspect_copy_rot(rc, src, panscan, rot) \ + pl_rect2df_aspect_set_rot((rc), pl_rect2df_aspect(src), (rot), (panscan)) + +PL_API_END + +#endif // LIBPLACEBO_COMMON_H_ diff --git a/src/include/libplacebo/config.h.in b/src/include/libplacebo/config.h.in new file mode 100644 index 0000000..2ed6290 --- /dev/null +++ b/src/include/libplacebo/config.h.in @@ -0,0 +1,102 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_CONFIG_H_ +#define LIBPLACEBO_CONFIG_H_ + +// Increased any time the library changes in a fundamental/major way. +#define PL_MAJOR_VER @majorver@ + +// Increased any time the API changes. (Note: Does not reset when PL_MAJOR_VER +// is increased) +#define PL_API_VER @apiver@ + +// Increased any time a fix is made to a given API version. +#define PL_FIX_VER (pl_fix_ver()) + +// Friendly name (`git describe`) for the overall version of the library +#define PL_VERSION (pl_version()) + +// Feature tests. These aren't described in further detail, but may be useful +// for programmers wanting to programmatically check for feature support +// in their compiled libplacebo versions. 
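// [Editorial sketch, not part of the original header] These feature tests are
// preprocessor defines substituted in at build time (in place of the
// placeholder below), so user code can guard optional integrations at compile
// time. The macro name here is only illustrative of the naming scheme; check
// the generated header for the exact set:
//
//     #include <libplacebo/config.h>
//     #ifdef PL_HAVE_D3D11
//     #include <libplacebo/d3d11.h>
//     #endif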
+@extra_defs@ + +// Extra compiler-specific stuff +#ifndef PL_DEPRECATED +# if defined(_MSC_VER) +# define PL_DEPRECATED +# else +# define PL_DEPRECATED __attribute__((deprecated)) +# endif +#endif + +#ifndef __has_feature +#define __has_feature(x) 0 +#endif + +#ifndef PL_DEPRECATED_ENUMERATOR +# if (defined(__GNUC__) && (__GNUC__ >= 6)) || __has_feature(enumerator_attributes) +# define PL_DEPRECATED_ENUMERATOR PL_DEPRECATED +# else +# define PL_DEPRECATED_ENUMERATOR +# endif +#endif + +#if defined(_WIN32) || defined(__CYGWIN__) +# ifdef PL_EXPORT +# define PL_API __declspec(dllexport) +# else +# ifndef PL_STATIC +# define PL_API __declspec(dllimport) +# else +# define PL_API +# endif +# endif +#else +# define PL_API __attribute__ ((visibility ("default"))) +#endif + +// C++ compatibility +#ifdef __cplusplus +# define PL_API_BEGIN extern "C" { +# define PL_API_END } +#else +# define PL_API_BEGIN +# define PL_API_END +#endif + +#ifndef __cplusplus +// Disable this warning because libplacebo's params macros override fields +# pragma GCC diagnostic ignored "-Woverride-init" +#endif + +// Extra helper macros +#define PL_TOSTRING_INNER(x) #x +#define PL_TOSTRING(x) PL_TOSTRING_INNER(x) + +// Deprecated macro for back-compatibility +#define PL_STRUCT(name) struct name##_t + +PL_API_BEGIN + +PL_API int pl_fix_ver(void); +PL_API const char *pl_version(void); + +PL_API_END + +#endif // LIBPLACEBO_CONFIG_H_ diff --git a/src/include/libplacebo/d3d11.h b/src/include/libplacebo/d3d11.h new file mode 100644 index 0000000..8ecba30 --- /dev/null +++ b/src/include/libplacebo/d3d11.h @@ -0,0 +1,248 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_D3D11_H_ +#define LIBPLACEBO_D3D11_H_ + +#include <windows.h> +#include <d3d11.h> +#include <dxgi1_2.h> +#include <libplacebo/gpu.h> +#include <libplacebo/swapchain.h> + +PL_API_BEGIN + +// Structure representing the actual D3D11 device and associated GPU instance +typedef const struct pl_d3d11_t { + pl_gpu gpu; + + // The D3D11 device in use. The user is free to use this for their own + // purposes, including taking a reference to the device (with AddRef) and + // using it beyond the lifetime of the pl_d3d11 that created it (though if + // this is done with debug enabled, it will confuse the leak checker.) + ID3D11Device *device; + + // True if the device is using a software (WARP) adapter + bool software; +} *pl_d3d11; + +struct pl_d3d11_params { + // The Direct3D 11 device to use. Optional, if NULL then libplacebo will + // create its own ID3D11Device using the options below. If set, all the + // options below will be ignored. + ID3D11Device *device; + + // --- Adapter selection options + + // The adapter to use. This overrides adapter_luid. + IDXGIAdapter *adapter; + + // The LUID of the adapter to use. 
If adapter and adapter_luid are unset, + // the default adapter will be used instead. + LUID adapter_luid; + + // Allow a software (WARP) adapter when selecting the adapter automatically. + // Note that sometimes the default adapter will be a software adapter. This + // is because, on Windows 8 and up, if there are no hardware adapters, + // Windows will pretend the WARP adapter is the default hardware adapter. + bool allow_software; + + // Always use a software adapter. This is mainly for testing purposes. + bool force_software; + + // --- Device creation options + + // Enable the debug layer (D3D11_CREATE_DEVICE_DEBUG) + // Also logs IDXGIInfoQueue messages + bool debug; + + // Extra flags to pass to D3D11CreateDevice (D3D11_CREATE_DEVICE_FLAG). + // libplacebo should be compatible with any flags passed here. + UINT flags; + + // The minimum and maximum allowable feature levels for the created device. + // libplacebo will attempt to create a device with the highest feature level + // between min_feature_level and max_feature_level (inclusive.) If there are + // no supported feature levels in this range, `pl_d3d11_create` will either + // return NULL or fall back to the software adapter, depending on whether + // `allow_software` is set. + // + // Normally there is no reason to set `max_feature_level` other than to test + // if a program works at lower feature levels. + // + // Note that D3D_FEATURE_LEVEL_9_3 and below (known as 10level9) are highly + // restrictive. These feature levels are supported on a best-effort basis. + // They represent very old DirectX 9 compatible PC and laptop hardware + // (2001-2007, GeForce FX, 6, 7, ATI R300-R500, GMA 950-X3000) and some + // less-old mobile devices (Surface RT, Surface 2.) Basic video rendering + // should work, but the full pl_gpu API will not be available and advanced + // shaders will probably fail. The hardware is probably too slow for these + // anyway. + // + // Known restrictions of 10level9 devices include: + // D3D_FEATURE_LEVEL_9_3 and below: + // - `pl_pass_run_params->index_buf` will not work (but `index_data` will) + // - Dimensions of 3D textures must be powers of two + // - Shaders cannot use gl_FragCoord + // - Shaders cannot use texelFetch + // D3D_FEATURE_LEVEL_9_2 and below: + // - Fragment shaders have no dynamic flow control and very strict limits + // on the number of constants, temporary registers and instructions. + // Whether a shader meets the requirements will depend on how it's + // compiled and optimized, but it's likely that only simple shaders will + // work. + // D3D_FEATURE_LEVEL_9_1: + // - No high-bit-depth formats with PL_FMT_CAP_RENDERABLE or + // PL_FMT_CAP_LINEAR + // + // If these restrictions are undesirable and you don't need to support + // ancient hardware, set `min_feature_level` to D3D_FEATURE_LEVEL_10_0. + int min_feature_level; // Defaults to D3D_FEATURE_LEVEL_9_1 if unset + int max_feature_level; // Defaults to D3D_FEATURE_LEVEL_12_1 if unset + + // Allow up to N in-flight frames. Similar to swapchain_depth for Vulkan and + // OpenGL, though with DXGI this is a device-wide setting that affects all + // swapchains (except for waitable swapchains.) See the documentation for + // `pl_swapchain_latency` for more information. + int max_frame_latency; +}; + +// Default/recommended parameters. Should generally be safe and efficient. +#define PL_D3D11_DEFAULTS \ + .allow_software = true, + +#define pl_d3d11_params(...) 
(&(struct pl_d3d11_params) { PL_D3D11_DEFAULTS __VA_ARGS__ }) +PL_API extern const struct pl_d3d11_params pl_d3d11_default_params; + +// Creates a new Direct3D 11 device based on the given parameters, or wraps an +// existing device, and initializes a new GPU instance. If params is left as +// NULL, it defaults to &pl_d3d11_default_params. If an existing device is +// provided in params->device, `pl_d3d11_create` will take a reference to it +// that will be released in `pl_d3d11_destroy`. +PL_API pl_d3d11 pl_d3d11_create(pl_log log, const struct pl_d3d11_params *params); + +// Release the D3D11 device. +// +// Note that all libplacebo objects allocated from this pl_d3d11 object (e.g. +// via `d3d11->gpu` or using `pl_d3d11_create_swapchain`) *must* be explicitly +// destroyed by the user before calling this. +PL_API void pl_d3d11_destroy(pl_d3d11 *d3d11); + +// For a `pl_gpu` backed by `pl_d3d11`, this function can be used to retrieve +// the underlying `pl_d3d11`. Returns NULL for any other type of `gpu`. +PL_API pl_d3d11 pl_d3d11_get(pl_gpu gpu); + +struct pl_d3d11_swapchain_params { + // The Direct3D 11 swapchain to wrap. Optional. If NULL, libplacebo will + // create its own swapchain using the options below. If set, all the + // swapchain creation options will be ignored. + // + // The provided swapchain must have been created by the same device used + // by `gpu` and must not have multisampled backbuffers. + IDXGISwapChain *swapchain; + + // --- Swapchain creation options + + // Initial framebuffer width and height. If both width and height are set to + // 0 and window is non-NULL, the client area of the window is used instead. + // For convenience, if either component would be 0, it is set to 1 instead. + // This is because Windows can have 0-sized windows, but not 0-sized + // swapchains. + int width; + int height; + + // The handle of the output window. In Windows 8 and up this is optional + // because you can output to a CoreWindow or create a composition swapchain + // instead. + HWND window; + + // A pointer to the CoreWindow to output to. If both this and `window` are + // NULL, CreateSwapChainForComposition will be used to create the swapchain. + IUnknown *core_window; + + // If set, libplacebo will create a swapchain that uses the legacy bitblt + // presentation model (with the DXGI_SWAP_EFFECT_DISCARD swap effect.) This + // tends to give worse performance and frame pacing in windowed mode and it + // prevents borderless fullscreen optimizations, but it might be necessary + // to work around buggy drivers, especially with DXGI 1.2 in the Platform + // Update for Windows 7. When unset, libplacebo will try to use the flip + // presentation model and only fall back to bitblt if flip is unavailable. + bool blit; + + // additional swapchain flags + // No validation on these flags is being performed, and swapchain creation + // may fail if an unsupported combination is requested. + UINT flags; + + // --- Swapchain usage behavior options + + // Disable using a 10-bit swapchain format for SDR output + bool disable_10bit_sdr; +}; + +#define pl_d3d11_swapchain_params(...) (&(struct pl_d3d11_swapchain_params) { __VA_ARGS__ }) + +// Creates a new Direct3D 11 swapchain, or wraps an existing one. If an existing +// swapchain is provided in params->swapchain, `pl_d3d11_create_swapchain` will +// take a reference to it that will be released in `pl_swapchain_destroy`. 
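// [Editorial sketch, not part of the original header] A typical standalone
// setup, letting libplacebo create both the device and the swapchain for an
// existing window; `log` and `hwnd` are assumed to already exist, and error
// handling is omitted for brevity:
//
//     pl_d3d11 d3d11 = pl_d3d11_create(log, pl_d3d11_params(
//         .allow_software = true,
//     ));
//     pl_swapchain sw = pl_d3d11_create_swapchain(d3d11,
//         pl_d3d11_swapchain_params(
//             .window = hwnd,
//         ));
//     // ... render via d3d11->gpu and present via sw ...
//     pl_swapchain_destroy(&sw);   // must happen before destroying the device
//     pl_d3d11_destroy(&d3d11);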
+PL_API pl_swapchain pl_d3d11_create_swapchain(pl_d3d11 d3d11, + const struct pl_d3d11_swapchain_params *params); + +// Takes a `pl_swapchain` created by pl_d3d11_create_swapchain and returns a +// reference to the underlying IDXGISwapChain. This increments the refcount, so +// call IDXGISwapChain::Release when finished with it. +PL_API IDXGISwapChain *pl_d3d11_swapchain_unwrap(pl_swapchain sw); + +struct pl_d3d11_wrap_params { + // The D3D11 texture to wrap, or a texture array containing the texture to + // wrap. Must be a ID3D11Texture1D, ID3D11Texture2D or ID3D11Texture3D + // created by the same device used by `gpu`, must have D3D11_USAGE_DEFAULT, + // and must not be mipmapped or multisampled. + ID3D11Resource *tex; + + // If tex is a texture array, this is the array member to use as the pl_tex. + int array_slice; + + // If tex is a video resource (eg. DXGI_FORMAT_AYUV, DXGI_FORMAT_NV12, + // DXGI_FORMAT_P010, etc.,) it can be wrapped as a pl_tex by specifying the + // type and size of the shader view. For planar video formats, the plane + // that is wrapped depends on the chosen format. + // + // If tex is not a video resource, these fields are unnecessary. The correct + // format will be determined automatically. If tex is not 2D, these fields + // are ignored. + // + // For a list of supported video formats and their corresponding view + // formats and sizes, see: + // https://microsoft.github.io/DirectX-Specs/d3d/archive/D3D11_3_FunctionalSpec.htm#VideoViews + DXGI_FORMAT fmt; + int w; + int h; +}; + +#define pl_d3d11_wrap_params(...) (&(struct pl_d3d11_wrap_params) { __VA_ARGS__ }) + +// Wraps an external texture into a pl_tex abstraction. `pl_d3d11_wrap` takes a +// reference to the texture, which is released when `pl_tex_destroy` is called. +// +// This function may fail due to incompatible formats, incompatible flags or +// other reasons, in which case it will return NULL. +PL_API pl_tex pl_d3d11_wrap(pl_gpu gpu, const struct pl_d3d11_wrap_params *params); + +PL_API_END + +#endif // LIBPLACEBO_D3D11_H_ diff --git a/src/include/libplacebo/dispatch.h b/src/include/libplacebo/dispatch.h new file mode 100644 index 0000000..7d43794 --- /dev/null +++ b/src/include/libplacebo/dispatch.h @@ -0,0 +1,239 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_DISPATCH_H_ +#define LIBPLACEBO_DISPATCH_H_ + +#include <libplacebo/shaders.h> +#include <libplacebo/gpu.h> + +PL_API_BEGIN + +// Thread-safety: Safe +typedef struct pl_dispatch_t *pl_dispatch; + +// Creates a new shader dispatch object. This object provides a translation +// layer between generated shaders (pl_shader) and the ra context such that it +// can be used to execute shaders. This dispatch object will also provide +// shader caching (for efficient re-use). 
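// [Editorial sketch, not part of the original header] The usual per-frame
// pattern looks roughly like this; `log`, `gpu`, `have_frames` and the steps
// that populate and dispatch the shader are placeholders, and the dispatch
// entry point that consumes `pl_dispatch_params` is declared further below in
// this header:
//
//     pl_dispatch dp = pl_dispatch_create(log, gpu);
//     while (have_frames) {
//         pl_dispatch_reset_frame(dp);
//         pl_shader sh = pl_dispatch_begin(dp);
//         // ... record shader operations into `sh` ...
//         // ... hand `sh` back to the dispatch together with a target ...
//     }
//     pl_dispatch_destroy(&dp);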
+PL_API pl_dispatch pl_dispatch_create(pl_log log, pl_gpu gpu); +PL_API void pl_dispatch_destroy(pl_dispatch *dp); + +// Reset/increments the internal counters of the pl_dispatch. This must be +// called whenever the user is going to begin with a new frame, in order to +// perform garbage collection and advance the state of the internal PRNG. +// +// Note that shaders generated by `pl_dispatch` are therefore entirely +// deterministic, as long as the sequence of calls (and inputs to the shader) +// are the same. +PL_API void pl_dispatch_reset_frame(pl_dispatch dp); + +// Returns a blank pl_shader object, suitable for recording rendering commands. +// For more information, see the header documentation in `shaders/*.h`. +PL_API pl_shader pl_dispatch_begin(pl_dispatch dp); + +// Struct passed to `info_callback`. Only valid until that function returns. +struct pl_dispatch_info { + // Information about the shader for this shader execution, as well as a + // 64-bit signature uniquely identifying it. + pl_shader_info shader; + uint64_t signature; + + // A list of execution times for this pass, in nanoseconds. May be empty. + uint64_t samples[256]; + int num_samples; + + // As a convenience, this contains the last, average and peak of the above + // list of samples. If `num_samples` is 0, these values are also 0. + uint64_t last; + uint64_t peak; + uint64_t average; +}; + +// Helper function to make a copy of `pl_dispatch_info`, while overriding +// (and dereferencing) whatever was previously stored there. +static inline void pl_dispatch_info_move(struct pl_dispatch_info *dst, + const struct pl_dispatch_info *src) +{ + pl_shader_info_deref(&dst->shader); + *dst = *src; + dst->shader = pl_shader_info_ref(src->shader); +} + +// Set up a dispatch callback for this `pl_dispatch` object. The given callback +// will be run for every successfully dispatched shader. Call this again with +// `cb == NULL` to disable. +PL_API void pl_dispatch_callback(pl_dispatch dp, void *priv, + void (*cb)(void *priv, + const struct pl_dispatch_info *)); + +struct pl_dispatch_params { + // The shader to execute. The pl_dispatch will take over ownership + // of this shader, and return it back to the internal pool. + // + // This shader must have a compatible signature, i.e. inputs + // `PL_SHADER_SIG_NONE` and outputs `PL_SHADER_SIG_COLOR`. + pl_shader *shader; + + // The texture to render to. This must have params compatible with the + // shader, i.e. `target->params.renderable` for fragment shaders and + // `target->params.storable` for compute shaders. + // + // Note: Even when not using compute shaders, users are advised to always + // set `target->params.storable` if permitted by the `pl_fmt`, since this + // allows the use of compute shaders instead of full-screen quads, which is + // faster on some platforms. + pl_tex target; + + // The target rect to render to. Optional, if left as {0}, then the + // entire texture will be rendered to. + pl_rect2d rect; + + // If set, enables and controls the blending for this pass. Optional. When + // using this with fragment shaders, `target->params.fmt->caps` must + // include `PL_FMT_CAP_BLENDABLE`. + const struct pl_blend_params *blend_params; + + // If set, records the execution time of this dispatch into the given + // timer object. Optional. + // + // Note: If this is set, `pl_dispatch` cannot internally measure the + // execution time of the shader, which means `pl_dispatch_info.samples` may + // be empty as a result. + pl_timer timer; +}; + +#define pl_dispatch_params(...) 
(&(struct pl_dispatch_params) { __VA_ARGS__ }) + +// Dispatch a generated shader (via the pl_shader mechanism). Returns whether +// or not the dispatch was successful. +PL_API bool pl_dispatch_finish(pl_dispatch dp, const struct pl_dispatch_params *params); + +struct pl_dispatch_compute_params { + // The shader to execute. This must be a compute shader with the input + // set to PL_SHADER_SIG_NONE. The output, if it has any, is ignored. + pl_shader *shader; + + // The number of work groups to dispatch in each dimension. If this is left + // as [0} and `width/height` are both set, the number of work groups will + // be inferred from the shader's `compute_group_sizes`. + int dispatch_size[3]; + + // If set, simulate vertex attributes (similar to `pl_dispatch_finish`) + // according to the given dimensions. The first two components of the + // thread's global ID will be interpreted as the X and Y locations. + // + // Optional, ignored if either component is left as 0. + int width, height; + + // If set, records the execution time of this dispatch into the given + // timer object. Optional. + // + // Note: If this is set, `pl_dispatch` cannot internally measure the + // execution time of the shader, which means `pl_dispatch_info.samples` may + // be empty as a result. + pl_timer timer; +}; + +#define pl_dispatch_compute_params(...) (&(struct pl_dispatch_compute_params) { __VA_ARGS__ }) + +// A variant of `pl_dispatch_finish`, this one only dispatches a compute shader +// while ignoring its output (if it has one). It's only useful for shaders +// which have otherwise observable side effects (such as updating state +// objects). +PL_API bool pl_dispatch_compute(pl_dispatch dp, const struct pl_dispatch_compute_params *params); + +enum pl_vertex_coords { + PL_COORDS_ABSOLUTE, // Absolute/integer `target` coordinates + PL_COORDS_RELATIVE, // Relative `target` coordinates in range [0, 1] + PL_COORDS_NORMALIZED, // GL-normalized coordinates in range [-1, 1] +}; + +struct pl_dispatch_vertex_params { + // The shader to execute. This must be a raster shader with the input set + // to `PL_SHADER_SIG_NONE` and the output set to `PL_SHADER_SIG_COLOR`. + // + // Additionally, the shader must not have any attached vertex attributes. + pl_shader *shader; + + // The texture to render to. Requires `target->params.renderable`. + pl_tex target; + + // The target rect to clip the rendering to. (Optional) + pl_rect2d scissors; + + // If set, enables and controls the blending for this pass. Optional. When + // enabled, `target->params.fmt->caps` must include `PL_FMT_CAP_BLENDABLE`. + const struct pl_blend_params *blend_params; + + // The description of the vertex format, including offsets. + // + // Note: `location` is ignored and can safely be left unset. + const struct pl_vertex_attrib *vertex_attribs; + int num_vertex_attribs; + size_t vertex_stride; + + // The index of the vertex position in `vertex_attribs`, as well as the + // interpretation of its contents. + int vertex_position_idx; + enum pl_vertex_coords vertex_coords; + bool vertex_flipped; // flip all vertex y coordinates + + // Type and number of vertices to render. + enum pl_prim_type vertex_type; + int vertex_count; + + // Vertex data. See `pl_pass_run_params.vertex_data`. + const void *vertex_data; + pl_buf vertex_buf; + size_t buf_offset; + + // Index data. See `pl_pass_run_params.index_data`. Optional. 
+ const void *index_data; + enum pl_index_format index_fmt; + pl_buf index_buf; + size_t index_offset; + + // If set, records the execution time of this dispatch into the given + // timer object. Optional. + // + // Note: If this is set, `pl_dispatch` cannot internally measure the + // execution time of the shader, which means `pl_dispatch_info.samples` may + // be empty as a result. + pl_timer timer; +}; + +#define pl_dispatch_vertex_params(...) (&(struct pl_dispatch_vertex_params) { __VA_ARGS__ }) + +// Dispatch a generated shader using custom vertices, rather than using a quad +// generated by the dispatch. This allows the use of e.g. custom fragment +// shaders for things like rendering custom UI elements, or possibly doing +// advanced things like sampling from a cube map or spherical video. +PL_API bool pl_dispatch_vertex(pl_dispatch dp, const struct pl_dispatch_vertex_params *params); + +// Cancel an active shader without submitting anything. Useful, for example, +// if the shader was instead merged into a different shader. +PL_API void pl_dispatch_abort(pl_dispatch dp, pl_shader *sh); + +// Deprecated in favor of `pl_cache_save/pl_cache_load` on the `pl_cache` +// associated with the `pl_gpu` this dispatch is using. +PL_DEPRECATED PL_API size_t pl_dispatch_save(pl_dispatch dp, uint8_t *out_cache); +PL_DEPRECATED PL_API void pl_dispatch_load(pl_dispatch dp, const uint8_t *cache); + +PL_API_END + +#endif // LIBPLACEBO_DISPATCH_H diff --git a/src/include/libplacebo/dither.h b/src/include/libplacebo/dither.h new file mode 100644 index 0000000..84f17c7 --- /dev/null +++ b/src/include/libplacebo/dither.h @@ -0,0 +1,82 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_DITHER_H_ +#define LIBPLACEBO_DITHER_H_ + +#include <libplacebo/common.h> + +PL_API_BEGIN + +// Generates a deterministic NxN bayer (ordered) dither matrix, storing the +// result in `data`. `size` must be a power of two. The resulting matrix will +// be roughly uniformly distributed within the range [0,1). +PL_API void pl_generate_bayer_matrix(float *data, int size); + +// Generates a random NxN blue noise texture. storing the result in `data`. +// `size` must be a positive power of two no larger than 256. The resulting +// texture will be roughly uniformly distributed within the range [0,1). +// +// Note: This function is very, *very* slow for large sizes. Generating a +// dither matrix with size 256 can take several seconds on a modern processor. 
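//
// For example, generating a 64x64 blue noise matrix (the caller provides
// the storage):
//
//   float noise[64 * 64];
//   pl_generate_blue_noise(noise, 64);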
+PL_API void pl_generate_blue_noise(float *data, int size); + +// Defines the border of all error diffusion kernels +#define PL_EDF_MIN_DX (-2) +#define PL_EDF_MAX_DX (2) +#define PL_EDF_MAX_DY (2) + +struct pl_error_diffusion_kernel { + const char *name; // Short and concise identifier + const char *description; // Longer / friendly name + + // The minimum value such that a (y, x) -> (y, x + y * shift) mapping will + // make all error pushing operations affect next column (and after it) + // only. + // + // Higher shift values are significantly more computationally intensive. + int shift; + + // The diffusion factor for (y, x) is pattern[y][x - PL_EDF_MIN_DX] / divisor. + int pattern[PL_EDF_MAX_DY + 1][PL_EDF_MAX_DX - PL_EDF_MIN_DX + 1]; + int divisor; +}; + +// Algorithms with shift=1: +PL_API extern const struct pl_error_diffusion_kernel pl_error_diffusion_simple; +PL_API extern const struct pl_error_diffusion_kernel pl_error_diffusion_false_fs; +// Algorithms with shift=2: +PL_API extern const struct pl_error_diffusion_kernel pl_error_diffusion_sierra_lite; +PL_API extern const struct pl_error_diffusion_kernel pl_error_diffusion_floyd_steinberg; +PL_API extern const struct pl_error_diffusion_kernel pl_error_diffusion_atkinson; +// Algorithms with shift=3, probably too heavy for low end GPUs: +PL_API extern const struct pl_error_diffusion_kernel pl_error_diffusion_jarvis_judice_ninke; +PL_API extern const struct pl_error_diffusion_kernel pl_error_diffusion_stucki; +PL_API extern const struct pl_error_diffusion_kernel pl_error_diffusion_burkes; +PL_API extern const struct pl_error_diffusion_kernel pl_error_diffusion_sierra2; +PL_API extern const struct pl_error_diffusion_kernel pl_error_diffusion_sierra3; + +// A list of built-in error diffusion kernels, terminated by NULL +PL_API extern const struct pl_error_diffusion_kernel * const pl_error_diffusion_kernels[]; +PL_API extern const int pl_num_error_diffusion_kernels; // excluding trailing NULL + +// Find the error diffusion kernel with the given name, or NULL on failure. +PL_API const struct pl_error_diffusion_kernel *pl_find_error_diffusion_kernel(const char *name); + +PL_API_END + +#endif // LIBPLACEBO_DITHER_H_ diff --git a/src/include/libplacebo/dummy.h b/src/include/libplacebo/dummy.h new file mode 100644 index 0000000..c298438 --- /dev/null +++ b/src/include/libplacebo/dummy.h @@ -0,0 +1,131 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_DUMMY_H_ +#define LIBPLACEBO_DUMMY_H_ + +#include <libplacebo/gpu.h> + +PL_API_BEGIN + +// The functions in this file allow creating and manipulating "dummy" contexts. +// A dummy context isn't actually mapped by the GPU, all data exists purely on +// the CPU. It also isn't capable of compiling or executing any shaders, any +// attempts to do so will simply fail. 
+// +// The main use case for this dummy context is for users who want to generate +// advanced shaders that depend on specific GLSL features or support for +// certain types of GPU resources (e.g. LUTs). This dummy context allows such +// shaders to be generated, with all of the referenced shader objects and +// textures simply containing their data in a host-accessible way. + +struct pl_gpu_dummy_params { + // These GPU parameters correspond to their equivalents in `pl_gpu`, and + // must obey the same rules as documented there. The values from + // `pl_gpu_dummy_default_params` are set to support pretty much everything + // and are set for GLSL version 450. + // + // Individual fields such as `glsl.compute` or `glsl.version` description + // can and should be overridden by the user based on their requirements. + // Individual limits should ideally be set based on the corresponding + // `glGet` queries etc. + struct pl_glsl_version glsl; + struct pl_gpu_limits limits; +}; + +#define PL_GPU_DUMMY_DEFAULTS \ + .glsl = { \ + .version = 450, \ + .gles = false, \ + .vulkan = false, \ + .compute = true, \ + .max_shmem_size = SIZE_MAX, \ + .max_group_threads = 1024, \ + .max_group_size = { 1024, 1024, 1024 }, \ + .subgroup_size = 32, \ + .min_gather_offset = INT16_MIN, \ + .max_gather_offset = INT16_MAX, \ + }, \ + .limits = { \ + /* pl_gpu */ \ + .callbacks = false, \ + .thread_safe = true, \ + /* pl_buf */ \ + .max_buf_size = SIZE_MAX, \ + .max_ubo_size = SIZE_MAX, \ + .max_ssbo_size = SIZE_MAX, \ + .max_vbo_size = SIZE_MAX, \ + .max_mapped_size = SIZE_MAX, \ + .max_buffer_texels = UINT64_MAX, \ + /* pl_tex */ \ + .max_tex_1d_dim = UINT32_MAX, \ + .max_tex_2d_dim = UINT32_MAX, \ + .max_tex_3d_dim = UINT32_MAX, \ + .buf_transfer = true, \ + .align_tex_xfer_pitch = 1, \ + .align_tex_xfer_offset = 1, \ + /* pl_pass */ \ + .max_variable_comps = SIZE_MAX, \ + .max_constants = SIZE_MAX, \ + .max_pushc_size = SIZE_MAX, \ + .max_dispatch = { UINT32_MAX, UINT32_MAX, UINT32_MAX }, \ + .fragment_queues = 0, \ + .compute_queues = 0, \ + }, + +#define pl_gpu_dummy_params(...) (&(struct pl_gpu_dummy_params) { PL_GPU_DUMMY_DEFAULTS __VA_ARGS__ }) +PL_API extern const struct pl_gpu_dummy_params pl_gpu_dummy_default_params; + +// Create a dummy GPU context based on the given parameters. This GPU will have +// a format for each host-representable type (i.e. intN_t, floats and doubles), +// in the canonical channel order RGBA. These formats will have every possible +// capability activated, respectively. +// +// If `params` is left as NULL, it defaults to `&pl_gpu_dummy_params`. +PL_API pl_gpu pl_gpu_dummy_create(pl_log log, const struct pl_gpu_dummy_params *params); +PL_API void pl_gpu_dummy_destroy(pl_gpu *gpu); + +// Back-doors into the `pl_tex` and `pl_buf` representations. These allow you +// to access the raw data backing this object. Textures are always laid out in +// a tightly packed manner. +// +// For "placeholder" dummy textures, this always returns NULL. +PL_API uint8_t *pl_buf_dummy_data(pl_buf buf); +PL_API uint8_t *pl_tex_dummy_data(pl_tex tex); + +// Skeleton of `pl_tex_params` containing only the fields relevant to +// `pl_tex_dummy_create`, plus the extra `sampler_type` field. +struct pl_tex_dummy_params { + int w, h, d; + pl_fmt format; + enum pl_sampler_type sampler_type; + void *user_data; +}; + +#define pl_tex_dummy_params(...) (&(struct pl_tex_dummy_params) { __VA_ARGS__ }) + +// Allows creating a "placeholder" dummy texture. 
This is basically a texture +// that isn't even backed by anything. All `pl_tex_*` operations (other than +// `pl_tex_destroy`) performed on it will simply fail. +// +// All of the permissions will be set to `false`, except `sampleable`, which is +// set to `true`. (So you can use it as an input to shader sampling functions) +PL_API pl_tex pl_tex_dummy_create(pl_gpu gpu, const struct pl_tex_dummy_params *params); + +PL_API_END + +#endif // LIBPLACEBO_DUMMY_H_ diff --git a/src/include/libplacebo/filters.h b/src/include/libplacebo/filters.h new file mode 100644 index 0000000..a95649d --- /dev/null +++ b/src/include/libplacebo/filters.h @@ -0,0 +1,415 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_FILTER_KERNELS_H_ +#define LIBPLACEBO_FILTER_KERNELS_H_ + +#include <stdbool.h> +#include <libplacebo/log.h> + +PL_API_BEGIN + +#define PL_FILTER_MAX_PARAMS 2 + +// Invocation parameters for a given kernel +struct pl_filter_ctx { + float radius; + float params[PL_FILTER_MAX_PARAMS]; +}; + +// Represents a single filter function, i.e. kernel or windowing function. +struct pl_filter_function { + // The cosmetic name associated with this filter function. + const char *name; + + // The radius of the filter function. For resizable filters, this gives + // the radius needed to represent a single filter lobe (tap). + float radius; + + // If true, the filter function is resizable (see pl_filter_config.radius) + bool resizable; + + // If true, the filter function is tunable (see pl_filter_config.params) + bool tunable[PL_FILTER_MAX_PARAMS]; + + // If the relevant parameter is tunable, this contains the default values. + float params[PL_FILTER_MAX_PARAMS]; + + // The underlying filter function itself: Computes the weight as a function + // of the offset. All filter functions must be normalized such that x=0 is + // the center point, and in particular weight(0) = 1.0. The functions may + // be undefined for values of x outside [0, radius]. + double (*weight)(const struct pl_filter_ctx *f, double x); + + // If true, this filter represents an opaque placeholder for a more + // sophisticated filter function which does not fit into the pl_filter + // framework. `weight()` will always return 0.0. + bool opaque; +}; + +// Deprecated function, merely checks a->weight == b->weight +PL_DEPRECATED PL_API bool +pl_filter_function_eq(const struct pl_filter_function *a, + const struct pl_filter_function *b); + +// Box filter: Entirely 1.0 within the radius, entirely 0.0 outside of it. +// This is also sometimes called a Dirichlet window +PL_API extern const struct pl_filter_function pl_filter_function_box; + +// Triangle filter: Linear transitions from 1.0 at x=0 to 0.0 at x=radius. +// This is also sometimes called a Bartlett window. 
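//
// For illustration, a user-defined `pl_filter_function` equivalent to this
// window could be expressed as the following sketch (names are hypothetical):
//
//   static double my_triangle_weight(const struct pl_filter_ctx *f, double x)
//   {
//       // defined for x in [0, radius]; 1.0 at the center, 0.0 at the edge
//       return 1.0 - x / f->radius;
//   }
//
//   static const struct pl_filter_function my_triangle = {
//       .name      = "my_triangle",
//       .radius    = 1.0,
//       .resizable = true,
//       .weight    = my_triangle_weight,
//   };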
+PL_API extern const struct pl_filter_function pl_filter_function_triangle; + +// Cosine filter: Ordinary cosine function, single lobe. +PL_API extern const struct pl_filter_function pl_filter_function_cosine; + +// Hann function: Cosine filter named after Julius von Hann. Also commonly +// mislabeled as a "Hanning" function, due to its similarly to the Hamming +// function. +PL_API extern const struct pl_filter_function pl_filter_function_hann; + +// Hamming function: Cosine filter named after Richard Hamming. +PL_API extern const struct pl_filter_function pl_filter_function_hamming; + +// Welch filter: Polynomial function consisting of a single parabolic section. +PL_API extern const struct pl_filter_function pl_filter_function_welch; + +// Kaiser filter: Approximation of the DPSS window using Bessel functions. +// Also sometimes called a Kaiser-Bessel window. +// Parameter [0]: Shape (alpha). Determines the trade-off between the main lobe +// and the side lobes. +PL_API extern const struct pl_filter_function pl_filter_function_kaiser; + +// Blackman filter: Cosine filter named after Ralph Beebe Blackman. +// Parameter [0]: Scale (alpha). Influences the shape. The defaults result in +// zeros at the third and fourth sidelobes. +PL_API extern const struct pl_filter_function pl_filter_function_blackman; + +// Bohman filter: 2nd order Cosine filter. +PL_API extern const struct pl_filter_function pl_filter_function_bohman; + +// Gaussian function: Similar to the Gaussian distribution, this defines a +// bell curve function. +// Parameter [0]: Scale (t), increasing makes the result blurrier. +PL_API extern const struct pl_filter_function pl_filter_function_gaussian; + +// Quadratic function: 2nd order approximation of the gaussian function. Also +// sometimes called a "quadric" window. +PL_API extern const struct pl_filter_function pl_filter_function_quadratic; + +// Sinc function: Widely used for both kernels and windows, sinc(x) = sin(x)/x. +PL_API extern const struct pl_filter_function pl_filter_function_sinc; + +// Jinc function: Similar to sinc, but extended to the 2D domain. Widely +// used as the kernel of polar (EWA) filters. Also sometimes called a Sombrero +// function. +PL_API extern const struct pl_filter_function pl_filter_function_jinc; + +// Sphinx function: Similar to sinc and jinx, but extended to the 3D domain. +// The name is derived from "spherical" sinc. Can be used to filter 3D signals +// in theory. +PL_API extern const struct pl_filter_function pl_filter_function_sphinx; + +// B/C-tunable Spline function: This is a family of commonly used spline +// functions with two tunable parameters. Does not need to be windowed. +// Parameter [0]: "B" +// Parameter [1]: "C" +// Some popular variants of this function are: +// B = 1.0, C = 0.0: "base" Cubic (blurry) +// B = 0.0, C = 0.0: Hermite filter (blocky) +// B = 0.0, C = 0.5: Catmull-Rom filter (sharp) +// B = 1/3, C = 1/3: Mitchell-Netravali filter (soft, doesn't ring) +// B ≈ 0.37, C ≈ 0.31: Robidoux filter (used by ImageMagick) +// B ≈ 0.26, C ≈ 0.37: RobidouxSharp filter (sharper variant of Robidoux) +PL_API extern const struct pl_filter_function pl_filter_function_cubic; +PL_API extern const struct pl_filter_function pl_filter_function_hermite; +#define pl_filter_function_bicubic pl_filter_function_cubic +#define pl_filter_function_bcspline pl_filter_function_cubic + +// Cubic splines with 2/3/4 taps. 
Referred to as "spline16", "spline36", and +// "spline64" mainly for historical reasons, based on the number of pixels in +// their window when using them as 2D orthogonal filters. Do not need to be +// windowed. +PL_API extern const struct pl_filter_function pl_filter_function_spline16; +PL_API extern const struct pl_filter_function pl_filter_function_spline36; +PL_API extern const struct pl_filter_function pl_filter_function_spline64; + +// Special filter function for the built-in oversampling algorithm. This is an +// opaque filter with no meaningful representation. though it has one tunable +// parameter controlling the threshold at which to switch back to ordinary +// nearest neighbour sampling. (See `pl_shader_sample_oversample`) +PL_API extern const struct pl_filter_function pl_filter_function_oversample; + +// A list of built-in filter functions, terminated by NULL +// +// Note: May contain extra aliases for the above functions. +PL_API extern const struct pl_filter_function * const pl_filter_functions[]; +PL_API extern const int pl_num_filter_functions; // excluding trailing NULL + +// Find the filter function with the given name, or NULL on failure. +PL_API const struct pl_filter_function *pl_find_filter_function(const char *name); + +// Backwards compatibility with the older configuration API. Redundant with +// `pl_filter_function.name`. May be formally deprecated in the future. + +struct pl_filter_function_preset { + const char *name; + const struct pl_filter_function *function; +}; + +// A list of built-in filter function presets, terminated by {0} +PL_API extern const struct pl_filter_function_preset pl_filter_function_presets[]; +PL_API extern const int pl_num_filter_function_presets; // excluding trailing {0} + +// Find the filter function preset with the given name, or NULL on failure. +PL_API const struct pl_filter_function_preset *pl_find_filter_function_preset(const char *name); + +// Different usage domains for a filter +enum pl_filter_usage { + PL_FILTER_UPSCALING = (1 << 0), + PL_FILTER_DOWNSCALING = (1 << 1), + PL_FILTER_FRAME_MIXING = (1 << 2), + + PL_FILTER_SCALING = PL_FILTER_UPSCALING | PL_FILTER_DOWNSCALING, + PL_FILTER_ALL = PL_FILTER_SCALING | PL_FILTER_FRAME_MIXING, +}; + +// Represents a tuned combination of filter functions, plus parameters +struct pl_filter_config { + // The cosmetic name associated with this filter config. Optional for + // user-provided configs, but always set by built-in configurations. + const char *name; + + // Longer / friendly name. Always set for built-in configurations, + // except for names which are merely aliases of other filters. + const char *description; + + // Allowed and recommended usage domains (respectively) + // + // When it is desired to maintain a simpler user interface, it may be + // recommended to include only scalers whose recommended usage domains + // includes the relevant context in which it will be used. + enum pl_filter_usage allowed; + enum pl_filter_usage recommended; + + // The kernel function and (optionally) windowing function. + const struct pl_filter_function *kernel; + const struct pl_filter_function *window; + + // The radius. Ignored if !kernel->resizable. Optional, defaults to + // kernel->radius if unset. + float radius; + + // Parameters for the respective filter function. Ignored if not tunable. + float params[PL_FILTER_MAX_PARAMS]; + float wparams[PL_FILTER_MAX_PARAMS]; + + // Represents a clamping coefficient for negative weights. A value of 0.0 + // (the default) represents no clamping. 
A value of 1.0 represents full + // clamping, i.e. all negative weights will be clamped to 0. Values in + // between will be linearly scaled. + float clamp; + + // Additional blur coefficient. This effectively stretches the kernel, + // without changing the effective radius of the filter radius. Setting this + // to a value of 0.0 is equivalent to disabling it. Values significantly + // below 1.0 may seriously degrade the visual output, and should be used + // with care. + float blur; + + // Additional taper coefficient. This essentially flattens the function's + // center. The values within [-taper, taper] will return 1.0, with the + // actual function being squished into the remainder of [taper, radius]. + // Defaults to 0.0. + float taper; + + // If true, this filter is intended to be used as a polar/2D filter (EWA) + // instead of a separable/1D filter. Does not affect the actual sampling, + // but provides information about how the results are to be interpreted. + bool polar; + + // Antiringing strength. A value of 0.0 disables antiringing, and a value + // of 1.0 enables full-strength antiringing. Defaults to 0.0 if + // unspecified. + // + // Note: This is only included in `pl_filter_config` for convenience. Does + // not affect the actual filter sampling, but provides information to the + // downstream consumer of the `pl_filter`. + float antiring; +}; + +PL_API bool pl_filter_config_eq(const struct pl_filter_config *a, + const struct pl_filter_config *b); + +// Samples a given filter configuration at a given x coordinate, while +// respecting all parameters of the configuration. +PL_API double pl_filter_sample(const struct pl_filter_config *c, double x); + +// A list of built-in filter configurations. Since they are just combinations +// of the above filter functions, they are not described in much further +// detail. 
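//
// Any of the configs below can also be evaluated directly on the CPU, e.g.
// (a sketch using `pl_filter_sample` from above):
//
//   double w = pl_filter_sample(&pl_filter_mitchell, 0.5);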
+PL_API extern const struct pl_filter_config pl_filter_spline16; // 2 taps +PL_API extern const struct pl_filter_config pl_filter_spline36; // 3 taps +PL_API extern const struct pl_filter_config pl_filter_spline64; // 4 taps +PL_API extern const struct pl_filter_config pl_filter_nearest; +PL_API extern const struct pl_filter_config pl_filter_box; +PL_API extern const struct pl_filter_config pl_filter_bilinear; +PL_API extern const struct pl_filter_config pl_filter_gaussian; +// Sinc family (all configured to 3 taps): +PL_API extern const struct pl_filter_config pl_filter_sinc; // unwindowed +PL_API extern const struct pl_filter_config pl_filter_lanczos; // sinc-sinc +PL_API extern const struct pl_filter_config pl_filter_ginseng; // sinc-jinc +PL_API extern const struct pl_filter_config pl_filter_ewa_jinc; // unwindowed +PL_API extern const struct pl_filter_config pl_filter_ewa_lanczos; // jinc-jinc +PL_API extern const struct pl_filter_config pl_filter_ewa_lanczossharp; +PL_API extern const struct pl_filter_config pl_filter_ewa_lanczos4sharpest; +PL_API extern const struct pl_filter_config pl_filter_ewa_ginseng; // jinc-sinc +PL_API extern const struct pl_filter_config pl_filter_ewa_hann; // jinc-hann +// Spline family +PL_API extern const struct pl_filter_config pl_filter_bicubic; +PL_API extern const struct pl_filter_config pl_filter_hermite; +PL_API extern const struct pl_filter_config pl_filter_catmull_rom; +PL_API extern const struct pl_filter_config pl_filter_mitchell; +PL_API extern const struct pl_filter_config pl_filter_mitchell_clamp; // clamp = 1.0 +PL_API extern const struct pl_filter_config pl_filter_robidoux; +PL_API extern const struct pl_filter_config pl_filter_robidouxsharp; +PL_API extern const struct pl_filter_config pl_filter_ewa_robidoux; +PL_API extern const struct pl_filter_config pl_filter_ewa_robidouxsharp; +// Special/opaque filters +PL_API extern const struct pl_filter_config pl_filter_oversample; + +// Backwards compatibility +#define pl_filter_triangle pl_filter_bilinear +#define pl_oversample_frame_mixer pl_filter_oversample + +// A list of built-in filter configs, terminated by NULL +PL_API extern const struct pl_filter_config * const pl_filter_configs[]; +PL_API extern const int pl_num_filter_configs; // excluding trailing NULL + +// Find the filter config with the given name, or NULL on failure. +// `usage` restricts the valid usage (based on `pl_filter_config.allowed`). +PL_API const struct pl_filter_config * +pl_find_filter_config(const char *name, enum pl_filter_usage usage); + +// Backward compatibility with the previous filter configuration API. Redundant +// with pl_filter_config.name/description. May be deprecated in the future. +struct pl_filter_preset { + const char *name; + const struct pl_filter_config *filter; + + // Longer / friendly name, or NULL for aliases + const char *description; +}; + +// A list of built-in filter presets, terminated by {0} +PL_API extern const struct pl_filter_preset pl_filter_presets[]; +PL_API extern const int pl_num_filter_presets; // excluding trailing {0} + +// Find the filter preset with the given name, or NULL on failure. +PL_API const struct pl_filter_preset *pl_find_filter_preset(const char *name); + +// Parameters for filter generation. +struct pl_filter_params { + // The particular filter configuration to be sampled. config.kernel must + // be set to a valid pl_filter_function. + struct pl_filter_config config; + + // The precision of the resulting LUT. 
A value of 64 should be fine for + // most practical purposes, but higher or lower values may be justified + // depending on the use case. This value must be set to something > 0. + int lut_entries; + + // --- Polar filers only (config.polar) + + // As a micro-optimization, all samples below this cutoff value will be + // ignored when updating the cutoff radius. Setting it to a value of 0.0 + // disables this optimization. + float cutoff; + + // --- Separable filters only (!config.polar) + + // Indicates the maximum row size that is supported by the calling code, or + // 0 for no limit. + int max_row_size; + + // Indicates the row stride alignment. For some use cases (e.g. uploading + // the weights as a texture), there are certain alignment requirements for + // each row. The chosen row_size will always be a multiple of this value. + // Specifying 0 indicates no alignment requirements. + int row_stride_align; + + // --- Deprecated options + float filter_scale PL_DEPRECATED; // no effect, use `config.blur` instead +}; + +#define pl_filter_params(...) (&(struct pl_filter_params) { __VA_ARGS__ }) + +// Represents an initialized instance of a particular filter, with a +// precomputed LUT. The interpretation of the LUT depends on the type of the +// filter (polar or separable). +typedef const struct pl_filter_t { + // Deep copy of the parameters, for convenience. + struct pl_filter_params params; + + // Contains the true radius of the computed filter. This may be + // smaller than the configured radius depending on the exact filter + // parameters used. Mainly relevant for polar filters, since + // it affects the value range of *weights. + float radius; + + // Radius of the first zero crossing (main lobe size). + float radius_zero; + + // The computed look-up table (LUT). For polar filters, this is interpreted + // as a 1D array with dimensions [lut_entries] containing the raw filter + // samples on the scale [0, radius]. For separable (non-polar) filters, + // this is interpreted as a 2D array with dimensions + // [lut_entries][row_stride]. The inner rows contain the `row_size` samples + // to convolve with the corresponding input pixels. The outer coordinate is + // used to very the fractional offset (phase). So for example, if the + // sample position to reconstruct is directly aligned with the source + // texels, you would use the values from weights[0]. If the sample position + // to reconstruct is exactly half-way between two source texels (180° out + // of phase), you would use the values from weights[lut_entries/2]. + const float *weights; + + // --- separable filters only (!params.config.polar) + + // The number of source texels to convolve over for each row. This value + // will never exceed the given `max_row_size`. If the filter ends up + // cut off because of this, the bool `insufficient` will be set to true. + int row_size; + bool insufficient; + + // The separation (in *weights) between each row of the filter. Always + // a multiple of params.row_stride_align. + int row_stride; + + // --- deprecated / removed fields + float radius_cutoff PL_DEPRECATED; // identical to `radius` +} *pl_filter; + +// Generate (compute) a filter instance based on a given filter configuration. +// The resulting pl_filter must be freed with `pl_filter_free` when no longer +// needed. Returns NULL if filter generation fails due to invalid parameters +// (i.e. missing a required parameter). 
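//
// A minimal sketch (assuming an existing `pl_log log`):
//
//   pl_filter flt = pl_filter_generate(log, pl_filter_params(
//       .config      = pl_filter_lanczos,
//       .lut_entries = 64,
//   ));
//   if (flt) {
//       // use flt->weights, flt->row_size, flt->row_stride, ...
//       pl_filter_free(&flt);
//   }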
+PL_API pl_filter pl_filter_generate(pl_log log, const struct pl_filter_params *params); +PL_API void pl_filter_free(pl_filter *filter); + +PL_API_END + +#endif // LIBPLACEBO_FILTER_KERNELS_H_ diff --git a/src/include/libplacebo/gamut_mapping.h b/src/include/libplacebo/gamut_mapping.h new file mode 100644 index 0000000..a92a73b --- /dev/null +++ b/src/include/libplacebo/gamut_mapping.h @@ -0,0 +1,182 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_GAMUT_MAPPING_H_ +#define LIBPLACEBO_GAMUT_MAPPING_H_ + +#include <libplacebo/common.h> +#include <libplacebo/colorspace.h> + +PL_API_BEGIN + +struct pl_gamut_map_params; +struct pl_gamut_map_function { + const char *name; // Identifier + const char *description; // Friendly / longer name + + // The gamut-mapping function itself. Iterates over all values in `lut`, + // and adapts them as needed. + void (*map)(float *lut, const struct pl_gamut_map_params *params); + + // Returns true if `map` supports both stretching and contracting the + // gamut. In this case, `map` is always executed, even if the output gamut + // is larger than the input gamut. + bool bidirectional; + + // Private data. Unused by libplacebo, but may be accessed by `map`. + void *priv; +}; + +struct pl_gamut_map_constants { + // (Relative) chromaticity protection zone for perceptual mapping [0,1] + float perceptual_deadzone; + + // Strength of the perceptual saturation mapping component [0,1] + float perceptual_strength; + + // I vs C curve gamma to use for colorimetric clipping [0,10] + float colorimetric_gamma; + + // Knee point to use for softclipping methods (perceptual, softclip) [0,1] + float softclip_knee; + + // Desaturation strength (for softclip only) [0,1] + float softclip_desat; +}; + +#define PL_GAMUT_MAP_CONSTANTS \ + .colorimetric_gamma = 1.80f, \ + .softclip_knee = 0.70f, \ + .softclip_desat = 0.35f, \ + .perceptual_deadzone = 0.30f, \ + .perceptual_strength = 0.80f, + +struct pl_gamut_map_params { + // If `function` is NULL, defaults to `pl_gamut_map_clip`. + const struct pl_gamut_map_function *function; + + // The desired input/output primaries. This affects the subjective color + // volume in which the desired mapping shall take place. + struct pl_raw_primaries input_gamut; + struct pl_raw_primaries output_gamut; + + // Minimum/maximum luminance (PQ) of the target display. Note that the same + // value applies to both the input and output, since it's assumed that tone + // mapping has already happened by this stage. This effectively defines the + // legal gamut boundary in RGB space. + // + // This also defines the I channel value range, for `pl_gamut_map_generate` + float min_luma; + float max_luma; + + // Common constants, should be initialized to PL_GAMUT_MAP_CONSTANTS if + // not intending to override them further. 
+ struct pl_gamut_map_constants constants; + + // -- LUT generation options (for `pl_gamut_map_generate` only) + + // The size of the resulting LUT, per channel. + // + // Note: For quality, it's generally best to increase h > I > C + int lut_size_I; + int lut_size_C; + int lut_size_h; + + // The stride (in number of floats) between elements in the resulting LUT. + int lut_stride; + + // -- Removed parameters + float chroma_margin PL_DEPRECATED; // non-functional +}; + +#define pl_gamut_map_params(...) (&(struct pl_gamut_map_params) { \ + .constants = { PL_GAMUT_MAP_CONSTANTS }, \ + __VA_ARGS__ \ +}) + +// Note: Only does pointer equality testing on `function` +PL_API bool pl_gamut_map_params_equal(const struct pl_gamut_map_params *a, + const struct pl_gamut_map_params *b); + +// Returns true if the given gamut mapping configuration effectively represents +// a no-op configuration. Gamut mapping can be skipped in this case. +PL_API bool pl_gamut_map_params_noop(const struct pl_gamut_map_params *params); + +// Generate a gamut-mapping LUT for a given configuration. LUT samples are +// stored as IPTPQc4 values, but the LUT itself is indexed by IChPQc4,spanning +// the effective range [min_luma, max_luma] × [0, 0.5] × [-pi,pi]. +// +// This ordering is designed to keep frequently co-occurring values close in +// memory, while permitting simple wrapping of the 'h' component. +PL_API void pl_gamut_map_generate(float *out, const struct pl_gamut_map_params *params); + +// Samples a gamut mapping function for a single IPTPQc4 value. The input +// values are updated in-place. +PL_API void pl_gamut_map_sample(float x[3], const struct pl_gamut_map_params *params); + +// Performs no gamut-mapping, just hard clips out-of-range colors per-channel. +PL_API extern const struct pl_gamut_map_function pl_gamut_map_clip; + +// Performs a perceptually balanced (saturation) gamut mapping, using a soft +// knee function to preserve in-gamut colors, followed by a final softclip +// operation. This works bidirectionally, meaning it can both compress and +// expand the gamut. Behaves similar to a blend of `saturation` and `softclip`. +PL_API extern const struct pl_gamut_map_function pl_gamut_map_perceptual; + +// Performs a perceptually balanced gamut mapping using a soft knee function to +// roll-off clipped regions, and a hue shifting function to preserve saturation. +PL_API extern const struct pl_gamut_map_function pl_gamut_map_softclip; + +// Performs relative colorimetric clipping, while maintaining an exponential +// relationship between brightness and chromaticity. +PL_API extern const struct pl_gamut_map_function pl_gamut_map_relative; + +// Performs simple RGB->RGB saturation mapping. The input R/G/B channels are +// mapped directly onto the output R/G/B channels. Will never clip, but will +// distort all hues and/or result in a faded look. +PL_API extern const struct pl_gamut_map_function pl_gamut_map_saturation; + +// Performs absolute colorimetric clipping. Like pl_gamut_map_relative, but +// does not adapt the white point. +PL_API extern const struct pl_gamut_map_function pl_gamut_map_absolute; + +// Performs constant-luminance colorimetric clipping, desaturing colors +// towards white until they're in-range. +PL_API extern const struct pl_gamut_map_function pl_gamut_map_desaturate; + +// Uniformly darkens the input slightly to prevent clipping on blown-out +// highlights, then clamps colorimetrically to the input gamut boundary, +// biased slightly to preserve chromaticity over luminance. 
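//
// For example, mapping a single IPTPQc4 value with this function (a sketch;
// `pl_raw_primaries_get` is the helper from <libplacebo/colorspace.h>, and
// the luminance values are illustrative):
//
//   float ipt[3] = { 0.5f, 0.0f, 0.1f };
//   pl_gamut_map_sample(ipt, pl_gamut_map_params(
//       .function     = &pl_gamut_map_darken,
//       .input_gamut  = *pl_raw_primaries_get(PL_COLOR_PRIM_BT_2020),
//       .output_gamut = *pl_raw_primaries_get(PL_COLOR_PRIM_BT_709),
//       .min_luma     = 0.0f,   // display black point (PQ)
//       .max_luma     = 0.58f,  // roughly 203 cd/m² in PQ (SDR white)
//   ));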
+PL_API extern const struct pl_gamut_map_function pl_gamut_map_darken; + +// Performs no gamut mapping, but simply highlights out-of-gamut pixels. +PL_API extern const struct pl_gamut_map_function pl_gamut_map_highlight; + +// Linearly/uniformly desaturates the image in order to bring the entire +// image into the target gamut. +PL_API extern const struct pl_gamut_map_function pl_gamut_map_linear; + +// A list of built-in gamut mapping functions, terminated by NULL +PL_API extern const struct pl_gamut_map_function * const pl_gamut_map_functions[]; +PL_API extern const int pl_num_gamut_map_functions; // excluding trailing NULL + +// Find the gamut mapping function with the given name, or NULL on failure. +PL_API const struct pl_gamut_map_function *pl_find_gamut_map_function(const char *name); + +PL_API_END + +#endif // LIBPLACEBO_GAMUT_MAPPING_H_ diff --git a/src/include/libplacebo/gpu.h b/src/include/libplacebo/gpu.h new file mode 100644 index 0000000..a63fdf7 --- /dev/null +++ b/src/include/libplacebo/gpu.h @@ -0,0 +1,1464 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_GPU_H_ +#define LIBPLACEBO_GPU_H_ + +#include <stddef.h> +#include <stdbool.h> +#include <stdint.h> + +#include <libplacebo/common.h> +#include <libplacebo/cache.h> +#include <libplacebo/log.h> + +PL_API_BEGIN + +// These are not memory managed, and should represent compile-time constants +typedef const char *pl_debug_tag; +#define PL_DEBUG_TAG (__FILE__ ":" PL_TOSTRING(__LINE__)) + +// Type of a shader input descriptor. +enum pl_desc_type { + PL_DESC_INVALID = 0, + PL_DESC_SAMPLED_TEX, // C: pl_tex* GLSL: combined texture sampler + // (`pl_tex->params.sampleable` must be set) + PL_DESC_STORAGE_IMG, // C: pl_tex* GLSL: storage image + // (`pl_tex->params.storable` must be set) + PL_DESC_BUF_UNIFORM, // C: pl_buf* GLSL: uniform buffer + // (`pl_buf->params.uniform` must be set) + PL_DESC_BUF_STORAGE, // C: pl_buf* GLSL: storage buffer + // (`pl_buf->params.storable` must be set) + PL_DESC_BUF_TEXEL_UNIFORM,// C: pl_buf* GLSL: uniform samplerBuffer + // (`pl_buf->params.uniform` and `format` must be set) + PL_DESC_BUF_TEXEL_STORAGE,// C: pl_buf* GLSL: uniform imageBuffer + // (`pl_buf->params.uniform` and `format` must be set) + PL_DESC_TYPE_COUNT +}; + +// This file contains the definition of an API which is designed to abstract +// away from platform-specific APIs like the various OpenGL variants, Direct3D +// and Vulkan in a common way. It is a much more limited API than those APIs, +// since it tries targeting a very small common subset of features that is +// needed to implement libplacebo's rendering. +// +// NOTE: Most, but not all, parameter conditions (phrases such as "must" or +// "valid usage" are explicitly tested and result in error messages followed by +// graceful failure. Exceptions are noted where they exist. 
+ +// Structure which wraps metadata describing GLSL capabilities. +struct pl_glsl_version { + int version; // GLSL version (e.g. 450), for #version + bool gles; // GLSL ES semantics (ESSL) + bool vulkan; // GL_KHR_vulkan_glsl semantics + + // Compute shader support and limits. If `compute` is false, then all + // of the remaining fields in this section are {0}. + bool compute; + size_t max_shmem_size; // maximum compute shader shared memory size + uint32_t max_group_threads; // maximum number of local threads per work group + uint32_t max_group_size[3]; // maximum work group size per dimension + + // If nonzero, signals availability of shader subgroups. This guarantess + // availability of all of the following extensions: + // - GL_KHR_shader_subgroup_basic + // - GL_KHR_shader_subgroup_vote + // - GL_KHR_shader_subgroup_arithmetic + // - GL_KHR_shader_subgroup_ballot + // - GL_KHR_shader_subgroup_shuffle + uint32_t subgroup_size; + + // Miscellaneous shader limits + int16_t min_gather_offset; // minimum `textureGatherOffset` offset + int16_t max_gather_offset; // maximum `textureGatherOffset` offset +}; + +// Backwards compatibility alias +#define pl_glsl_desc pl_glsl_version + +// Structure defining the physical limits and capabilities of this GPU +// instance. If a limit is given as 0, that means that feature is unsupported. +struct pl_gpu_limits { + // --- pl_gpu + bool thread_safe; // `pl_gpu` calls are thread-safe + bool callbacks; // supports asynchronous GPU callbacks + + // --- pl_buf + size_t max_buf_size; // maximum size of any buffer + size_t max_ubo_size; // maximum size of a `uniform` buffer + size_t max_ssbo_size; // maximum size of a `storable` buffer + size_t max_vbo_size; // maximum size of a `drawable` buffer + size_t max_mapped_size; // maximum size of a `host_mapped` buffer + uint64_t max_buffer_texels; // maximum number of texels in a texel buffer + bool host_cached; // if true, PL_BUF_MEM_HOST buffers are cached + + // Required alignment for PL_HANDLE_HOST_PTR imports. This is provided + // merely as a hint to the user. If the host pointer being imported is + // misaligned, libplacebo will internally round (over-map) the region. + size_t align_host_ptr; + + // --- pl_tex + uint32_t max_tex_1d_dim; // maximum width for a 1D texture + uint32_t max_tex_2d_dim; // maximum width/height for a 2D texture (required) + uint32_t max_tex_3d_dim; // maximum width/height/depth for a 3D texture + bool blittable_1d_3d; // supports blittable 1D/3D textures + bool buf_transfer; // supports `pl_tex_transfer_params.buf` + + // These don't represent hard limits but indicate performance hints for + // optimal alignment. For best performance, the corresponding field + // should be aligned to a multiple of these. They will always be a power + // of two. + size_t align_tex_xfer_pitch; // optimal `pl_tex_transfer_params.row_pitch` + size_t align_tex_xfer_offset; // optimal `pl_tex_transfer_params.buf_offset` + + // --- pl_pass + size_t max_variable_comps; // maximum components passed in variables + size_t max_constants; // maximum `pl_pass_params.num_constants` + bool array_size_constants; // push constants can be used to size arrays + size_t max_pushc_size; // maximum `push_constants_size` + size_t align_vertex_stride; // alignment of `pl_pass_params.vertex_stride` + uint32_t max_dispatch[3]; // maximum dispatch size per dimension + + // Note: At least one of `max_variable_comps` or `max_ubo_size` is + // guaranteed to be nonzero. 
+ + // As a performance hint, the GPU may signal the number of command queues + // it has for fragment and compute shaders, respectively. Users may use + // this information to decide the appropriate type of shader to dispatch. + uint32_t fragment_queues; + uint32_t compute_queues; +}; + +// Backwards compatibility aliases +#define max_xfer_size max_buf_size +#define align_tex_xfer_stride align_tex_xfer_pitch + +// Some `pl_gpu` operations allow sharing GPU resources with external APIs - +// examples include interop with other graphics APIs such as CUDA, and also +// various hardware decoding APIs. This defines the mechanism underpinning the +// communication of such an interoperation. +typedef uint64_t pl_handle_caps; +enum pl_handle_type { + PL_HANDLE_FD = (1 << 0), // `int fd` for POSIX-style APIs + PL_HANDLE_WIN32 = (1 << 1), // `HANDLE` for win32 API + PL_HANDLE_WIN32_KMT = (1 << 2), // `HANDLE` for pre-Windows-8 win32 API + PL_HANDLE_DMA_BUF = (1 << 3), // 'int fd' for a dma_buf fd + PL_HANDLE_HOST_PTR = (1 << 4), // `void *` for a host-allocated pointer + PL_HANDLE_MTL_TEX = (1 << 5), // `MTLTexture*` for Apple platforms + PL_HANDLE_IOSURFACE = (1 << 6), // `IOSurfaceRef` for Apple platforms +}; + +struct pl_gpu_handle_caps { + pl_handle_caps tex; // supported handles for `pl_tex` + `pl_shared_mem` + pl_handle_caps buf; // supported handles for `pl_buf` + `pl_shared_mem` + pl_handle_caps sync; // supported handles for `pl_sync` / semaphores +}; + +// Wrapper for the handle used to communicate a shared resource externally. +// This handle is owned by the `pl_gpu` - if a user wishes to use it in a way +// that takes over ownership (e.g. importing into some APIs), they must clone +// the handle before doing so (e.g. using `dup` for fds). It is important to +// read the external API documentation _very_ carefully as different handle +// types may be managed in different ways. (eg: CUDA takes ownership of an fd, +// but does not take ownership of a win32 handle). +union pl_handle { + int fd; // PL_HANDLE_FD / PL_HANDLE_DMA_BUF + void *handle; // PL_HANDLE_WIN32 / PL_HANDLE_WIN32_KMT / PL_HANDLE_MTL_TEX / PL_HANDLE_IOSURFACE + void *ptr; // PL_HANDLE_HOST_PTR +}; + +// Structure encapsulating memory that is shared between libplacebo and the +// user. This memory can be imported into external APIs using the handle. +// +// If the object a `pl_shared_mem` belongs to is destroyed (e.g. via +// `pl_buf_destroy`), the handle becomes undefined, as do the contents of the +// memory it points to, as well as any external API objects imported from it. +struct pl_shared_mem { + union pl_handle handle; + size_t size; // the total size of the memory referenced by this handle + size_t offset; // the offset of the object within the referenced memory + + // Note: `size` is optional for some APIs and handle types, in particular + // when importing DMABUFs or D3D11 textures. + + // For PL_HANDLE_DMA_BUF, this specifies the DRM format modifier that + // describes this resource. Note that when importing `pl_buf`, this must + // be DRM_FORMAT_MOD_LINEAR. For importing `pl_tex`, it can be any + // format modifier supported by the implementation. + uint64_t drm_format_mod; + + // When importing a `pl_tex` of type PL_HANDLE_DMA_BUF, this can be used to + // set the image stride (AKA pitch) in memory. If left as 0, defaults to + // the image width/height. + size_t stride_w; + size_t stride_h; + + // When importing a `pl_tex` of type PL_HANDLE_MTL_TEX, this determines + // which plane is imported (0 - 2). 
+ unsigned plane; +}; + +// Structure grouping PCI bus address fields for GPU devices +struct pl_gpu_pci_address { + uint32_t domain; + uint32_t bus; + uint32_t device; + uint32_t function; +}; + +typedef const struct pl_fmt_t *pl_fmt; + +// Abstract device context which wraps an underlying graphics context and can +// be used to dispatch rendering commands. +// +// Thread-safety: Depends on `pl_gpu_limits.thread_safe` +typedef const struct pl_gpu_t { + pl_log log; + + struct pl_glsl_version glsl; // GLSL features supported by this GPU + struct pl_gpu_limits limits; // physical device limits and capabilities + + // Fields relevant to external API interop. If the underlying device does + // not support interop with other APIs, these will all be {0}. + struct pl_gpu_handle_caps export_caps; // supported handles for exporting + struct pl_gpu_handle_caps import_caps; // supported handles for importing + uint8_t uuid[16]; // underlying device UUID + + // Supported texture formats, in preference order. (If there are multiple + // similar formats, the "better" ones come first) + pl_fmt *formats; + int num_formats; + + // PCI Bus address of the underlying device, to help with interop. + // This will only be filled in if interop is supported. + struct pl_gpu_pci_address pci; +} *pl_gpu; + +// Attach a pl_cache object to this GPU instance. This cache will be +// used to cache all compiled shaders, as well as several other shader objects +// (e.g. cached 3DLUTs). Calling this with `cache = NULL` disables the cache. +// +// Note: Calling this after shaders have already been compiled will not +// retroactively add those shaders to the cache, so it's recommended to set +// this early, before creating any passes. +PL_API void pl_gpu_set_cache(pl_gpu gpu, pl_cache cache); + +enum pl_fmt_type { + PL_FMT_UNKNOWN = 0, // also used for inconsistent multi-component formats + PL_FMT_UNORM, // unsigned, normalized integer format (sampled as float) + PL_FMT_SNORM, // signed, normalized integer format (sampled as float) + PL_FMT_UINT, // unsigned integer format (sampled as integer) + PL_FMT_SINT, // signed integer format (sampled as integer) + PL_FMT_FLOAT, // (signed) float formats, any bit size + PL_FMT_TYPE_COUNT, +}; + +enum pl_fmt_caps { + PL_FMT_CAP_SAMPLEABLE = 1 << 0, // may be sampled from (PL_DESC_SAMPLED_TEX) + PL_FMT_CAP_STORABLE = 1 << 1, // may be used as storage image (PL_DESC_STORAGE_IMG) + PL_FMT_CAP_LINEAR = 1 << 2, // may be linearly samplied from (PL_TEX_SAMPLE_LINEAR) + PL_FMT_CAP_RENDERABLE = 1 << 3, // may be rendered to (pl_pass_params.target_fmt) + PL_FMT_CAP_BLENDABLE = 1 << 4, // may be blended to (pl_pass_params.enable_blend) + PL_FMT_CAP_BLITTABLE = 1 << 5, // may be blitted from/to (pl_tex_blit) + PL_FMT_CAP_VERTEX = 1 << 6, // may be used as a vertex attribute + PL_FMT_CAP_TEXEL_UNIFORM = 1 << 7, // may be used as a texel uniform buffer + PL_FMT_CAP_TEXEL_STORAGE = 1 << 8, // may be used as a texel storage buffer + PL_FMT_CAP_HOST_READABLE = 1 << 9, // may be used with `host_readable` textures + PL_FMT_CAP_READWRITE = 1 << 10, // may be used with PL_DESC_ACCESS_READWRITE + + // Notes: + // - PL_FMT_CAP_LINEAR also implies PL_FMT_CAP_SAMPLEABLE + // - PL_FMT_CAP_STORABLE also implies `pl_gpu.glsl.compute` + // - PL_FMT_CAP_BLENDABLE implies PL_FMT_CAP_RENDERABLE + // - PL_FMT_CAP_VERTEX implies that the format is non-opaque + // - PL_FMT_CAP_HOST_READABLE implies that the format is non-opaque +}; + +struct pl_fmt_plane { + // Underlying format of this particular sub-plane. 
This describes the + // components, texel size and host representation for the purpose of + // e.g. transfers, blits, and sampling. + pl_fmt format; + + // X/Y subsampling shift factor for this plane. + uint8_t shift_x, shift_y; +}; + +// Structure describing a texel/vertex format. +struct pl_fmt_t { + const char *name; // symbolic name for this format (e.g. rgba32f) + uint64_t signature; // unique but stable signature (for pass reusability) + + enum pl_fmt_type type; // the format's data type and interpretation + enum pl_fmt_caps caps; // the features supported by this format + int num_components; // number of components for this format + int component_depth[4]; // meaningful bits per component, texture precision + size_t internal_size; // internal texel size (for blit compatibility) + + // For planar formats, this provides a description of each sub-plane. + // + // Note on planar formats: Planar formats are always opaque and typically + // support only a limit subset of capabilities (or none at all). Access + // should be done via sub-planes. (See `pl_tex.planes`) + struct pl_fmt_plane planes[4]; + int num_planes; // or 0 for non-planar textures + + // This controls the relationship between the data as seen by the host and + // the way it's interpreted by the texture. The host representation is + // always tightly packed (no padding bits in between each component). + // + // This representation assumes little endian ordering, i.e. components + // being ordered from LSB to MSB in memory. Note that for oddly packed + // formats like rgb10a2 or rgb565, this is inconsistent with the naming. + // (That is to say, rgb565 has sample order {2, 1, 0} under this convention + // - because rgb565 treats the R channel as the *most* significant bits) + // + // If `opaque` is true, then there's no meaningful correspondence between + // the two, and all of the remaining fields in this section are unset. + // + // If `emulated` is true, then this format doesn't actually exist on the + // GPU as an uploadable texture format - and any apparent support is being + // emulated (typically using compute shaders in the upload path). + bool opaque; + bool emulated; + size_t texel_size; // total size in bytes per texel + size_t texel_align; // texel alignment requirements (bytes) + int host_bits[4]; // number of meaningful bits in host memory + int sample_order[4]; // sampled index for each component, e.g. + // {2, 1, 0, 3} for BGRA textures + + // For sampleable formats, this bool indicates whether or not the format + // is compatible with `textureGather()` + bool gatherable; + + // If usable as a vertex or texel buffer format, this gives the GLSL type + // corresponding to the data. (e.g. vec4) + const char *glsl_type; + + // If usable as a storage image or texel storage buffer + // (PL_FMT_CAP_STORABLE / PL_FMT_CAP_TEXEL_STORAGE), this gives the GLSL + // texel format corresponding to the format (e.g. rgba16ui), if any. This + // field may be NULL, in which case the format modifier may be left + // unspecified. + const char *glsl_format; + + // If available, this gives the fourcc associated with the host + // representation. In particular, this is intended for use with + // PL_HANDLE_DMA_BUF, where this field will match the DRM format from + // <drm_fourcc.h>. May be 0, for formats without matching DRM fourcc. + uint32_t fourcc; + + // If `fourcc` is set, this contains the list of supported drm format + // modifiers for this format. 
+ const uint64_t *modifiers; + int num_modifiers; +}; + +// Returns whether or not a pl_fmt's components are ordered sequentially +// in memory in the order RGBA. +PL_API bool pl_fmt_is_ordered(pl_fmt fmt); + +// Returns whether or not a pl_fmt is sampled as a float (e.g. UNORM) +PL_API bool pl_fmt_is_float(pl_fmt fmt); + +// Returns whether or not a pl_fmt supports a given DRM modifier. +PL_API bool pl_fmt_has_modifier(pl_fmt fmt, uint64_t modifier); + +// Helper function to find a format with a given number of components and +// minimum effective precision per component. If `host_bits` is set, then the +// format will always be non-opaque, unpadded, ordered and have exactly this +// bit depth for each component. Finally, all `caps` must be supported. +PL_API pl_fmt pl_find_fmt(pl_gpu gpu, enum pl_fmt_type type, int num_components, + int min_depth, int host_bits, enum pl_fmt_caps caps); + +// Finds a vertex format for a given configuration. The resulting vertex will +// have a component depth equivalent to the sizeof() the equivalent host type. +// (e.g. PL_FMT_FLOAT will always have sizeof(float)) +PL_API pl_fmt pl_find_vertex_fmt(pl_gpu gpu, enum pl_fmt_type type, int num_components); + +// Find a format based on its name. +PL_API pl_fmt pl_find_named_fmt(pl_gpu gpu, const char *name); + +// Find a format based on its fourcc. +PL_API pl_fmt pl_find_fourcc(pl_gpu gpu, uint32_t fourcc); + +// A generic 'timer query' object. These can be used to measure an +// approximation of the GPU execution time of a given operation. Due to the +// highly asynchronous nature of GPUs, the actual results of any individual +// timer query may be delayed by quite a bit. As such, users should avoid +// trying to pair any particular GPU command with any particular timer query +// result, and only reuse `pl_timer` objects with identical operations. The +// results of timer queries are guaranteed to be in-order, but individual +// queries may be dropped, and some operations might not record timer results +// at all. (For example, if the underlying hardware does not support timer +// queries for a given operation type) +// +// Thread-safety: Unsafe +typedef struct pl_timer_t *pl_timer; + +// Creates a new timer object. This may return NULL, for example if the +// implementation does not support timers, but since passing NULL to +// `pl_timer_destroy` and `pl_timer_query` is safe, users generally need not +// concern themselves with handling this. +PL_API pl_timer pl_timer_create(pl_gpu gpu); +PL_API void pl_timer_destroy(pl_gpu gpu, pl_timer *); + +// Queries any results that have been measured since the last execution of +// `pl_timer_query`. There may be more than one result, in which case the user +// should simply call the function again to get the subsequent values. This +// function returns a value of 0 in the event that there are no more +// unprocessed results. +// +// The results are reported in nanoseconds, but the actual precision of the +// timestamp queries may be significantly lower. +// +// Note: Results do not queue up indefinitely. Generally, the implementation +// will only keep track of a small, fixed number of results internally. Make +// sure to include this function as part of your main rendering loop to process +// all of its results, or older results will be overwritten by newer ones. 
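+//
+// As a rough, illustrative sketch only (the `frame_timer` handle and the
+// logging are assumptions, not part of this API), draining all pending
+// results once per frame could look like:
+//
+//   uint64_t ns;
+//   while ((ns = pl_timer_query(gpu, frame_timer)))
+//       printf("pass took %.3f ms\n", ns * 1e-6);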
+PL_API uint64_t pl_timer_query(pl_gpu gpu, pl_timer); + +enum pl_buf_mem_type { + PL_BUF_MEM_AUTO = 0, // use whatever seems most appropriate + PL_BUF_MEM_HOST, // try allocating from host memory (RAM) + PL_BUF_MEM_DEVICE, // try allocating from device memory (VRAM) + PL_BUF_MEM_TYPE_COUNT, + + // Note: This distinction only matters for discrete GPUs +}; + +// Structure describing a buffer. +struct pl_buf_params { + size_t size; // size in bytes (must be <= `pl_gpu_limits.max_buf_size`) + bool host_writable; // contents may be updated via pl_buf_write() + bool host_readable; // contents may be read back via pl_buf_read() + bool host_mapped; // create a persistent, RW mapping (pl_buf.data) + + // May be used as PL_DESC_BUF_UNIFORM or PL_DESC_BUF_TEXEL_UNIFORM. + // Requires `size <= pl_gpu_limits.max_ubo_size` + bool uniform; + + // May be used as PL_DESC_BUF_STORAGE or PL_DESC_BUF_TEXEL_STORAGE. + // Requires `size <= pl_gpu_limits.max_ssbo_size` + bool storable; + + // May be used as the source of vertex data for `pl_pass_run`. + bool drawable; + + // Provide a hint for the memory type you want to use when allocating + // this buffer's memory. + // + // Note: Restrictions may apply depending on the usage flags. In + // particular, allocating buffers with `uniform` or `storable` enabled from + // non-device memory will almost surely fail. + enum pl_buf_mem_type memory_type; + + // Setting this to a format with the `PL_FMT_CAP_TEXEL_*` capability allows + // this buffer to be used as a `PL_DESC_BUF_TEXEL_*`, when `uniform` and + // `storage` are respectively also enabled. + pl_fmt format; + + // At most one of `export_handle` and `import_handle` can be set for a + // buffer. + + // Setting this indicates that the memory backing this buffer should be + // shared with external APIs, If so, this must be exactly *one* of + // `pl_gpu.export_caps.buf`. + enum pl_handle_type export_handle; + + // Setting this indicates that the memory backing this buffer will be + // imported from an external API. If so, this must be exactly *one* of + // `pl_gpu.import_caps.buf`. + enum pl_handle_type import_handle; + + // If the shared memory is being imported, the import handle must be + // specified here. Otherwise, this is ignored. + struct pl_shared_mem shared_mem; + + // If non-NULL, the buffer will be created with these contents. Otherwise, + // the initial data is undefined. Using this does *not* require setting + // host_writable. + const void *initial_data; + + // Arbitrary user data. libplacebo does not use this at all. + void *user_data; + + // Arbitrary identifying tag. Used only for debugging purposes. + pl_debug_tag debug_tag; +}; + +#define pl_buf_params(...) (&(struct pl_buf_params) { \ + .debug_tag = PL_DEBUG_TAG, \ + __VA_ARGS__ \ + }) + +// A generic buffer, which can be used for multiple purposes (texture transfer, +// storage buffer, uniform buffer, etc.) +// +// Note on efficiency: A pl_buf does not necessarily represent a true "buffer" +// object on the underlying graphics API. It may also refer to a sub-slice of +// a larger buffer, depending on the implementation details of the GPU. The +// bottom line is that users do not need to worry about the efficiency of using +// many small pl_buf objects. Having many small pl_bufs, even lots of few-byte +// vertex buffers, is designed to be completely fine. 
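+//
+// As a hedged illustration of this point (the `verts` array is an assumption,
+// not a prescribed pattern), creating such a small one-off vertex buffer via
+// the `pl_buf_params` macro could look like:
+//
+//   pl_buf buf = pl_buf_create(gpu, pl_buf_params(
+//       .size         = sizeof(verts),
+//       .drawable     = true,
+//       .initial_data = verts,
+//   ));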
+// +// Thread-safety: Unsafe +typedef const struct pl_buf_t { + struct pl_buf_params params; + uint8_t *data; // for persistently mapped buffers, points to the first byte + + // If `params.handle_type` is set, this structure references the shared + // memory backing this buffer, via the requested handle type. + // + // While this buffer is not in an "exported" state, the contents of the + // memory are undefined. (See: `pl_buf_export`) + struct pl_shared_mem shared_mem; +} *pl_buf; + +// Create a buffer. The type of buffer depends on the parameters. The buffer +// parameters must adhere to the restrictions imposed by the pl_gpu_limits. +// Returns NULL on failure. +// +// For buffers with shared memory, the buffer is considered to be in an +// "exported" state by default, and may be used directly by the external API +// after being created (until the first libplacebo operation on the buffer). +PL_API pl_buf pl_buf_create(pl_gpu gpu, const struct pl_buf_params *params); +PL_API void pl_buf_destroy(pl_gpu gpu, pl_buf *buf); + +// This behaves like `pl_buf_create`, but if the buffer already exists and has +// incompatible parameters, it will get destroyed first. A buffer is considered +// "compatible" if it has the same buffer type and texel format, a size greater +// than or equal to the requested size, and it has a superset of the features +// the user requested. After this operation, the contents of the buffer are +// undefined. +// +// Note: Due to its unpredictability, it's not allowed to use this with +// `params->initial_data` being set. Similarly, it's not allowed on a buffer +// with `params->export_handle`. since this may invalidate the corresponding +// external API's handle. Conversely, it *is* allowed on a buffer with +// `params->host_mapped`, and the corresponding `buf->data` pointer *may* +// change as a result of doing so. +// +// Note: If the `user_data` alone changes, this does not trigger a buffer +// recreation. In theory, this can be used to detect when the buffer ended +// up being recreated. +PL_API bool pl_buf_recreate(pl_gpu gpu, pl_buf *buf, const struct pl_buf_params *params); + +// Update the contents of a buffer, starting at a given offset (must be a +// multiple of 4) and up to a given size, with the contents of *data. +// +// This function will block until the buffer is no longer in use. Use +// `pl_buf_poll` to perform non-blocking queries of buffer availability. +// +// Note: This function can incur synchronization overhead, so it shouldn't be +// used in tight loops. If you do need to loop (e.g. to perform a strided +// write), consider using host-mapped buffers, or fixing the memory in RAM, +// before calling this function. +PL_API void pl_buf_write(pl_gpu gpu, pl_buf buf, size_t buf_offset, + const void *data, size_t size); + +// Read back the contents of a buffer, starting at a given offset, storing the +// data into *dest. Returns whether successful. +// +// This function will block until the buffer is no longer in use. Use +// `pl_buf_poll` to perform non-blocking queries of buffer availability. +PL_API bool pl_buf_read(pl_gpu gpu, pl_buf buf, size_t buf_offset, + void *dest, size_t size); + +// Copy `size` bytes from one buffer to another, reading from and writing to +// the respective offsets. +PL_API void pl_buf_copy(pl_gpu gpu, pl_buf dst, size_t dst_offset, + pl_buf src, size_t src_offset, size_t size); + +// Initiates a buffer export operation, allowing a buffer to be accessed by an +// external API. 
This is only valid for buffers with `params.handle_type`. +// Calling this twice in a row is a harmless no-op. Returns whether successful. +// +// There is no corresponding "buffer import" operation, the next libplacebo +// operation that touches the buffer (e.g. pl_tex_upload, but also pl_buf_write +// and pl_buf_read) will implicitly import the buffer back to libplacebo. Users +// must ensure that all pending operations made by the external API are fully +// completed before using it in libplacebo again. (Otherwise, the behaviour +// is undefined) +// +// Please note that this function returning does not mean the memory is +// immediately available as such. In general, it will mark a buffer as "in use" +// in the same way any other buffer operation would, and it is the user's +// responsibility to wait until `pl_buf_poll` returns false before accessing +// the memory from the external API. +// +// In terms of the access performed by this operation, it is not considered a +// "read" or "write" and therefore does not technically conflict with reads or +// writes to the buffer performed by the host (via mapped memory - any use of +// `pl_buf_read` or `pl_buf_write` would defeat the purpose of the export). +// However, restrictions made by the external API may apply that prevent this. +// +// The recommended use pattern is something like this: +// +// while (loop) { +// pl_buf buf = get_free_buffer(); // or block on pl_buf_poll +// // write to the buffer using the external API +// pl_tex_upload(gpu, /* ... buf ... */); // implicitly imports +// pl_buf_export(gpu, buf); +// } +// +// i.e. perform an external API operation, then use and immediately export the +// buffer in libplacebo, and finally wait until `pl_buf_poll` is false before +// re-using it in the external API. (Or get a new buffer in the meantime) +PL_API bool pl_buf_export(pl_gpu gpu, pl_buf buf); + +// Returns whether or not a buffer is currently "in use". This can either be +// because of a pending read operation, a pending write operation or a pending +// buffer export operation. Any access to the buffer by external APIs or via +// the host pointer (for host-mapped buffers) is forbidden while a buffer is +// "in use". The only exception to this rule is multiple reads, for example +// reading from a buffer with `pl_tex_upload` while simultaneously reading from +// it using mapped memory. +// +// The `timeout`, specified in nanoseconds, indicates how long to block for +// before returning. If set to 0, this function will never block, and only +// returns the current status of the buffer. The actual precision of the +// timeout may be significantly longer than one nanosecond, and has no upper +// bound. This function does not provide hard latency guarantees. This function +// may also return at any time, even if the buffer is still in use. If the user +// wishes to block until the buffer is definitely no longer in use, the +// recommended usage is: +// +// while (pl_buf_poll(gpu, buf, UINT64_MAX)) +// ; // do nothing +// +// Note: libplacebo operations on buffers are always internally synchronized, +// so this is only needed for host-mapped or externally exported buffers. +// However, it may be used to do non-blocking queries before calling blocking +// functions such as `pl_buf_read`. +// +// Note: If `pl_gpu_limits.thread_safe` is set, this function is implicitly +// synchronized, meaning it can safely be called on a `pl_buf` that is in use +// by another thread. 
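+//
+// As a small illustrative sketch (the `readback` buffer and `dst` pointer are
+// assumptions), a caller that wants to avoid blocking inside `pl_buf_read`
+// could first issue a zero-timeout poll:
+//
+//   if (!pl_buf_poll(gpu, readback, 0))
+//       pl_buf_read(gpu, readback, 0, dst, readback->params.size);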
+PL_API bool pl_buf_poll(pl_gpu gpu, pl_buf buf, uint64_t timeout); + +enum pl_tex_sample_mode { + PL_TEX_SAMPLE_NEAREST, // nearest neighbour sampling + PL_TEX_SAMPLE_LINEAR, // linear filtering, requires PL_FMT_CAP_LINEAR + PL_TEX_SAMPLE_MODE_COUNT, +}; + +enum pl_tex_address_mode { + PL_TEX_ADDRESS_CLAMP, // clamp the nearest edge texel + PL_TEX_ADDRESS_REPEAT, // repeat (tile) the texture + PL_TEX_ADDRESS_MIRROR, // repeat (mirror) the texture + PL_TEX_ADDRESS_MODE_COUNT, +}; + +// Structure describing a texture. +struct pl_tex_params { + int w, h, d; // physical dimension; unused dimensions must be 0 + pl_fmt format; + + // The following bools describe what operations can be performed. The + // corresponding pl_fmt capability must be set for every enabled + // operation type. + // + // Note: For planar formats, it is also possible to set capabilities only + // supported by sub-planes. In this case, the corresponding functionality + // will be available for the sub-plane, but not the planar texture itself. + bool sampleable; // usable as a PL_DESC_SAMPLED_TEX + bool renderable; // usable as a render target (pl_pass_run) + // (must only be used with 2D textures) + bool storable; // usable as a storage image (PL_DESC_IMG_*) + bool blit_src; // usable as a blit source + bool blit_dst; // usable as a blit destination + bool host_writable; // may be updated with pl_tex_upload() + bool host_readable; // may be fetched with pl_tex_download() + + // Note: For `blit_src`, `blit_dst`, the texture must either be + // 2-dimensional or `pl_gpu_limits.blittable_1d_3d` must be set. + + // At most one of `export_handle` and `import_handle` can be set for a + // texture. + + // Setting this indicates that the memory backing this texture should be + // shared with external APIs, If so, this must be exactly *one* of + // `pl_gpu.export_caps.tex`. + enum pl_handle_type export_handle; + + // Setting this indicates that the memory backing this texture will be + // imported from an external API. If so, this must be exactly *one* of + // `pl_gpu.import_caps.tex`. Mutually exclusive with `initial_data`. + enum pl_handle_type import_handle; + + // If the shared memory is being imported, the import handle must be + // specified here. Otherwise, this is ignored. + struct pl_shared_mem shared_mem; + + // If non-NULL, the texture will be created with these contents (tightly + // packed). Using this does *not* require setting host_writable. Otherwise, + // the initial data is undefined. Mutually exclusive with `import_handle`. + const void *initial_data; + + // Arbitrary user data. libplacebo does not use this at all. + void *user_data; + + // Arbitrary identifying tag. Used only for debugging purposes. + pl_debug_tag debug_tag; +}; + +#define pl_tex_params(...) (&(struct pl_tex_params) { \ + .debug_tag = PL_DEBUG_TAG, \ + __VA_ARGS__ \ + }) + +static inline int pl_tex_params_dimension(const struct pl_tex_params params) +{ + return params.d ? 3 : params.h ? 2 : 1; +} + +enum pl_sampler_type { + PL_SAMPLER_NORMAL, // gsampler2D, gsampler3D etc. + PL_SAMPLER_RECT, // gsampler2DRect + PL_SAMPLER_EXTERNAL, // gsamplerExternalOES + PL_SAMPLER_TYPE_COUNT, +}; + +// Conflates the following typical GPU API concepts: +// - texture itself +// - sampler state +// - staging buffers for texture upload +// - framebuffer objects +// - wrappers for swapchain framebuffers +// - synchronization needed for upload/rendering/etc. 
+// +// Essentially a pl_tex can be anything ranging from a normal texture, a wrapped +// external/real framebuffer, a framebuffer object + texture pair, a mapped +// texture (via pl_hwdec), or other sorts of things that can be sampled from +// and/or rendered to. +// +// Thread-safety: Unsafe +typedef const struct pl_tex_t *pl_tex; +struct pl_tex_t { + struct pl_tex_params params; + + // If `params.format` is a planar format, this contains `pl_tex` handles + // encapsulating individual texture planes. Conversely, if this is a + // sub-plane of a planar texture, `parent` points to the planar texture. + // + // Note: Calling `pl_tex_destroy` on sub-planes is undefined behavior. + pl_tex planes[4]; + pl_tex parent; + + // If `params.export_handle` is set, this structure references the shared + // memory backing this buffer, via the requested handle type. + // + // While this texture is not in an "exported" state, the contents of the + // memory are undefined. (See: `pl_tex_export`) + // + // Note: Due to vulkan driver limitations, `shared_mem.drm_format_mod` will + // currently always be set to DRM_FORMAT_MOD_INVALID. No guarantee can be + // made about the cross-driver compatibility of textures exported this way. + struct pl_shared_mem shared_mem; + + // If `params.sampleable` is true, this indicates the correct sampler type + // to use when sampling from this texture. + enum pl_sampler_type sampler_type; +}; + +// Create a texture (with undefined contents). Returns NULL on failure. This is +// assumed to be an expensive/rare operation, and may need to perform memory +// allocation or framebuffer creation. +PL_API pl_tex pl_tex_create(pl_gpu gpu, const struct pl_tex_params *params); +PL_API void pl_tex_destroy(pl_gpu gpu, pl_tex *tex); + +// This works like `pl_tex_create`, but if the texture already exists and has +// incompatible texture parameters, it will get destroyed first. A texture is +// considered "compatible" if it has the same texture format and sample/address +// mode and it supports a superset of the features the user requested. +// +// Even if the texture is not recreated, calling this function will still +// invalidate the contents of the texture. (Note: Because of this, +// `initial_data` may not be used with `pl_tex_recreate`. Doing so is an error) +// +// Note: If the `user_data` alone changes, this does not trigger a texture +// recreation. In theory, this can be used to detect when the texture ended +// up being recreated. +PL_API bool pl_tex_recreate(pl_gpu gpu, pl_tex *tex, const struct pl_tex_params *params); + +// Invalidates the contents of a texture. After this, the contents are fully +// undefined. +PL_API void pl_tex_invalidate(pl_gpu gpu, pl_tex tex); + +union pl_clear_color { + float f[4]; + int32_t i[4]; + uint32_t u[4]; +}; + +// Clear the dst texture with the given color (rgba). This is functionally +// identical to a blit operation, which means `dst->params.blit_dst` must be +// set. +PL_API void pl_tex_clear_ex(pl_gpu gpu, pl_tex dst, const union pl_clear_color color); + +// Wrapper for `pl_tex_clear_ex` which only works for floating point textures. +PL_API void pl_tex_clear(pl_gpu gpu, pl_tex dst, const float color[4]); + +struct pl_tex_blit_params { + // The texture to blit from. Must have `params.blit_src` enabled. + pl_tex src; + + // The texture to blit to. Must have `params.blit_dst` enabled, and a + // format that is loosely compatible with `src`. This essentially means + // that they must have the same `internal_size`. 
Additionally, UINT + // textures can only be blitted to other UINT textures, and SINT textures + // can only be blitted to other SINT textures. + pl_tex dst; + + // The region of the source texture to blit. Must be within the texture + // bounds of `src`. May be flipped. (Optional) + pl_rect3d src_rc; + + // The region of the destination texture to blit into. Must be within the + // texture bounds of `dst`. May be flipped. Areas outside of `dst_rc` in + // `dst` are preserved. (Optional) + pl_rect3d dst_rc; + + // If `src_rc` and `dst_rc` have different sizes, the texture will be + // scaled using the given texture sampling mode. + enum pl_tex_sample_mode sample_mode; +}; + +#define pl_tex_blit_params(...) (&(struct pl_tex_blit_params) { __VA_ARGS__ }) + +// Copy a sub-rectangle from one texture to another. +PL_API void pl_tex_blit(pl_gpu gpu, const struct pl_tex_blit_params *params); + +// Structure describing a texture transfer operation. +struct pl_tex_transfer_params { + // Texture to transfer to/from. Depending on the type of the operation, + // this must have params.host_writable (uploads) or params.host_readable + // (downloads) set, respectively. + pl_tex tex; + + // Note: Superfluous parameters are ignored, i.e. for a 1D texture, the y + // and z fields of `rc`, as well as the corresponding pitches, are ignored. + // In all other cases, the pitch must be large enough to contain the + // corresponding dimension of `rc`, and the `rc` must be normalized and + // fully contained within the image dimensions. Missing fields in the `rc` + // are inferred from the image size. If unset, the pitch is inferred + // from `rc` (that is, it's assumed that the data is tightly packed in the + // buffer). Otherwise, `row_pitch` *must* be a multiple of + // `tex->params.format->texel_align`, and `depth_pitch` must be a multiple + // of `row_pitch`. + pl_rect3d rc; // region of the texture to transfer + size_t row_pitch; // the number of bytes separating image rows + size_t depth_pitch; // the number of bytes separating image planes + + // An optional timer to report the approximate duration of the texture + // transfer to. Note that this is only an approximation, since the actual + // texture transfer may happen entirely in the background (in particular, + // for implementations with asynchronous transfer capabilities). It's also + // not guaranteed that all GPUs support this. + pl_timer timer; + + // An optional callback to fire after the operation completes. If this is + // specified, then the operation is performed asynchronously. Note that + // transfers to/from buffers are always asynchronous, even without, this + // field, so it's more useful for `ptr` transfers. (Though it can still be + // helpful to avoid having to manually poll buffers all the time) + // + // When this is *not* specified, uploads from `ptr` are still asynchronous + // but require a host memcpy, while downloads from `ptr` are blocking. As + // such, it's recommended to always try using asynchronous texture + // transfers wherever possible. + // + // Note: Requires `pl_gpu_limits.callbacks` + // + // Note: Callbacks are implicitly synchronized, meaning that callbacks are + // guaranteed to never execute concurrently with other callbacks. However, + // they may execute from any thread that the `pl_gpu` is used on. + void (*callback)(void *priv); + void *priv; // arbitrary user data + + // For the data source/target of a transfer operation, there are two valid + // options: + // + // 1. 
Transferring to/from a buffer: (requires `pl_gpu_limits.buf_transfer`) + pl_buf buf; // buffer to use + size_t buf_offset; // offset of data within buffer, should be a + // multiple of `tex->params.format->texel_size` + // 2. Transferring to/from host memory directly: + void *ptr; // address of data + bool no_import; // always use memcpy, bypassing host ptr import + + // Note: The contents of the memory region / buffer must exactly match the + // texture format; i.e. there is no explicit conversion between formats. +}; + +#define pl_tex_transfer_params(...) (&(struct pl_tex_transfer_params) { __VA_ARGS__ }) + +// Upload data to a texture. Returns whether successful. +PL_API bool pl_tex_upload(pl_gpu gpu, const struct pl_tex_transfer_params *params); + +// Download data from a texture. Returns whether successful. +PL_API bool pl_tex_download(pl_gpu gpu, const struct pl_tex_transfer_params *params); + +// Returns whether or not a texture is currently "in use". This can either be +// because of a pending read operation, a pending write operation or a pending +// texture export operation. Note that this function's usefulness is extremely +// limited under ordinary circumstances. In practically all cases, textures do +// not need to be directly synchronized by the user, except when interfacing +// with external libraries. This function should NOT, however, be used as a +// crutch to avoid having to implement semaphore-based synchronization. Use +// the API-specific functions such as `pl_vulkan_hold/release` for that. +// +// A good example of a use case in which this function is required is when +// interoperating with external memory management that needs to know when an +// imported texture is safe to free / reclaim internally, in which case +// semaphores are insufficient because memory management is a host operation. +// +// The `timeout`, specified in nanoseconds, indicates how long to block for +// before returning. If set to 0, this function will never block, and only +// returns the current status of the texture. The actual precision of the +// timeout may be significantly longer than one nanosecond, and has no upper +// bound. This function does not provide hard latency guarantees. This function +// may also return at any time, even if the texture is still in use. If the +// user wishes to block until the texture is definitely no longer in use, the +// recommended usage is: +// +// while (pl_tex_poll(gpu, buf, UINT64_MAX)) +// ; // do nothing +// +// Note: If `pl_gpu_limits.thread_safe` is set, this function is implicitly +// synchronized, meaning it can safely be called on a `pl_tex` that is in use +// by another thread. +PL_API bool pl_tex_poll(pl_gpu gpu, pl_tex tex, uint64_t timeout); + +// Data type of a shader input variable (e.g. uniform, or UBO member) +enum pl_var_type { + PL_VAR_INVALID = 0, + PL_VAR_SINT, // C: int GLSL: int/ivec + PL_VAR_UINT, // C: unsigned int GLSL: uint/uvec + PL_VAR_FLOAT, // C: float GLSL: float/vec/mat + PL_VAR_TYPE_COUNT +}; + +// Returns the host size (in bytes) of a pl_var_type. +PL_API size_t pl_var_type_size(enum pl_var_type type); + +// Represents a shader input variable (concrete data, e.g. vector, matrix) +struct pl_var { + const char *name; // name as used in the shader + enum pl_var_type type; + // The total number of values is given by dim_v * dim_m. For example, a + // vec2 would have dim_v = 2 and dim_m = 1. A mat3x4 would have dim_v = 4 + // and dim_m = 3. 
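+    // As a further worked example, an array such as vec4 foo[8] would have
+    // dim_v = 4, dim_m = 1 and dim_a = 8.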
+ int dim_v; // vector dimension + int dim_m; // matrix dimension (number of columns, see below) + int dim_a; // array dimension +}; + +// Helper functions for constructing the most common pl_vars, with names +// corresponding to their corresponding GLSL built-in types. +PL_API struct pl_var pl_var_float(const char *name); +PL_API struct pl_var pl_var_vec2(const char *name); +PL_API struct pl_var pl_var_vec3(const char *name); +PL_API struct pl_var pl_var_vec4(const char *name); +PL_API struct pl_var pl_var_mat2(const char *name); +PL_API struct pl_var pl_var_mat2x3(const char *name); +PL_API struct pl_var pl_var_mat2x4(const char *name); +PL_API struct pl_var pl_var_mat3(const char *name); +PL_API struct pl_var pl_var_mat3x4(const char *name); +PL_API struct pl_var pl_var_mat4x2(const char *name); +PL_API struct pl_var pl_var_mat4x3(const char *name); +PL_API struct pl_var pl_var_mat4(const char *name); +PL_API struct pl_var pl_var_int(const char *name); +PL_API struct pl_var pl_var_ivec2(const char *name); +PL_API struct pl_var pl_var_ivec3(const char *name); +PL_API struct pl_var pl_var_ivec4(const char *name); +PL_API struct pl_var pl_var_uint(const char *name); +PL_API struct pl_var pl_var_uvec2(const char *name); +PL_API struct pl_var pl_var_uvec3(const char *name); +PL_API struct pl_var pl_var_uvec4(const char *name); + +struct pl_named_var { + const char *glsl_name; + struct pl_var var; +}; + +// The same list as above, tagged by name and terminated with a {0} entry. +PL_API extern const struct pl_named_var pl_var_glsl_types[]; + +// Efficient helper function for performing a lookup in the above array. +// Returns NULL if the variable is not legal. Note that the array dimension is +// ignored, since it's usually part of the variable name and not the type name. +PL_API const char *pl_var_glsl_type_name(struct pl_var var); + +// Converts a pl_fmt to an "equivalent" pl_var. Equivalent in this sense means +// that the pl_var's type will be the same as the vertex's sampled type (e.g. +// PL_FMT_UNORM gets turned into PL_VAR_FLOAT). +PL_API struct pl_var pl_var_from_fmt(pl_fmt fmt, const char *name); + +// Describes the memory layout of a variable, relative to some starting location +// (typically the offset within a uniform/storage/pushconstant buffer) +// +// Note on matrices: All GPUs expect column major matrices, for both buffers and +// input variables. Care needs to be taken to avoid trying to use e.g. a +// pl_matrix3x3 (which is row major) directly as a pl_var_update.data! +// +// In terms of the host layout, a column-major matrix (e.g. matCxR) with C +// columns and R rows is treated like an array vecR[C]. The `stride` here refers +// to the separation between these array elements, i.e. the separation between +// the individual columns. +// +// Visualization of a mat4x3: +// +// 0 1 2 3 <- columns +// 0 [ (A) (D) (G) (J) ] +// 1 [ (B) (E) (H) (K) ] +// 2 [ (C) (F) (I) (L) ] +// ^ rows +// +// Layout in GPU memory: (stride=16, size=60) +// +// [ A B C ] X <- column 0, offset +0 +// [ D E F ] X <- column 1, offset +16 +// [ G H I ] X <- column 2, offset +32 +// [ J K L ] <- column 3, offset +48 +// +// Note the lack of padding on the last column in this example. +// In general: size <= stride * dim_m +// +// C representation: (stride=12, size=48) +// +// { { A, B, C }, +// { D, E, F }, +// { G, H, I }, +// { J, K, L } } +// +// Note on arrays: `stride` represents both the stride between elements of a +// matrix, and the stride between elements of an array. 
That is, there is no +// distinction between the columns of a matrix and the rows of an array. For +// example, a mat2[10] and a vec2[20] share the same pl_var_layout - the stride +// would be sizeof(vec2) and the size would be sizeof(vec2) * 2 * 10. +// +// For non-array/matrix types, `stride` is equal to `size`. + +struct pl_var_layout { + size_t offset; // the starting offset of the first byte + size_t stride; // the delta between two elements of an array/matrix + size_t size; // the total size of the input +}; + +// Returns the host layout of an input variable as required for a +// tightly-packed, byte-aligned C data type, given a starting offset. +PL_API struct pl_var_layout pl_var_host_layout(size_t offset, const struct pl_var *var); + +// Returns the GLSL std140 layout of an input variable given a current buffer +// offset, as required for a buffer descriptor of type PL_DESC_BUF_UNIFORM +// +// The normal way to use this function is when calculating the size and offset +// requirements of a uniform buffer in an incremental fashion, to calculate the +// new offset of the next variable in this buffer. +PL_API struct pl_var_layout pl_std140_layout(size_t offset, const struct pl_var *var); + +// Returns the GLSL std430 layout of an input variable given a current buffer +// offset, as required for a buffer descriptor of type PL_DESC_BUF_STORAGE, and +// for push constants. +PL_API struct pl_var_layout pl_std430_layout(size_t offset, const struct pl_var *var); + +// Convenience definitions / friendly names for these +#define pl_buf_uniform_layout pl_std140_layout +#define pl_buf_storage_layout pl_std430_layout +#define pl_push_constant_layout pl_std430_layout + +// Like memcpy, but copies bytes from `src` to `dst` in a manner governed by +// the stride and size of `dst_layout` as well as `src_layout`. Also takes +// into account the respective `offset`. +PL_API void memcpy_layout(void *dst, struct pl_var_layout dst_layout, + const void *src, struct pl_var_layout src_layout); + +// Represents a compile-time constant. +struct pl_constant { + enum pl_var_type type; // constant data type + uint32_t id; // GLSL `constant_id` + size_t offset; // byte offset in `constant_data` +}; + +// Represents a vertex attribute. +struct pl_vertex_attrib { + const char *name; // name as used in the shader + pl_fmt fmt; // data format (must have PL_FMT_CAP_VERTEX) + size_t offset; // byte offset into the vertex struct + int location; // vertex location (as used in the shader) +}; + +// Returns an abstract namespace index for a given descriptor type. This will +// always be a value >= 0 and < PL_DESC_TYPE_COUNT. Implementations can use +// this to figure out which descriptors may share the same value of `binding`. +// Bindings must only be unique for all descriptors within the same namespace. +PL_API int pl_desc_namespace(pl_gpu gpu, enum pl_desc_type type); + +// Access mode of a shader input descriptor. +enum pl_desc_access { + PL_DESC_ACCESS_READWRITE, + PL_DESC_ACCESS_READONLY, + PL_DESC_ACCESS_WRITEONLY, + PL_DESC_ACCESS_COUNT, +}; + +// Returns the GLSL syntax for a given access mode (e.g. "readonly"). +PL_API const char *pl_desc_access_glsl_name(enum pl_desc_access mode); + +// Represents a shader descriptor (e.g. texture or buffer binding) +struct pl_desc { + const char *name; // name as used in the shader + enum pl_desc_type type; + + // The binding of this descriptor, as used in the shader. All bindings + // within a namespace must be unique. 
(see: pl_desc_namespace) + int binding; + + // For storage images and storage buffers, this can be used to restrict + // the type of access that may be performed on the descriptor. Ignored for + // the other descriptor types (uniform buffers and sampled textures are + // always read-only). + enum pl_desc_access access; +}; + +// Framebuffer blending mode (for raster passes) +enum pl_blend_mode { + PL_BLEND_ZERO, + PL_BLEND_ONE, + PL_BLEND_SRC_ALPHA, + PL_BLEND_ONE_MINUS_SRC_ALPHA, + PL_BLEND_MODE_COUNT, +}; + +struct pl_blend_params { + enum pl_blend_mode src_rgb; + enum pl_blend_mode dst_rgb; + enum pl_blend_mode src_alpha; + enum pl_blend_mode dst_alpha; +}; + +#define pl_blend_params(...) (&(struct pl_blend_params) { __VA_ARGS__ }) + +// Typical alpha compositing +PL_API extern const struct pl_blend_params pl_alpha_overlay; + +enum pl_prim_type { + PL_PRIM_TRIANGLE_LIST, + PL_PRIM_TRIANGLE_STRIP, + PL_PRIM_TYPE_COUNT, +}; + +enum pl_index_format { + PL_INDEX_UINT16 = 0, + PL_INDEX_UINT32, + PL_INDEX_FORMAT_COUNT, +}; + +enum pl_pass_type { + PL_PASS_INVALID = 0, + PL_PASS_RASTER, // vertex+fragment shader + PL_PASS_COMPUTE, // compute shader (requires `pl_gpu.glsl.compute`) + PL_PASS_TYPE_COUNT, +}; + +// Description of a rendering pass. It conflates the following: +// - GLSL shader(s) and its list of inputs +// - target parameters (for raster passes) +struct pl_pass_params { + enum pl_pass_type type; + + // Input variables. + struct pl_var *variables; + int num_variables; + + // Input descriptors. + struct pl_desc *descriptors; + int num_descriptors; + + // Compile-time specialization constants. + struct pl_constant *constants; + int num_constants; + + // Initial data for the specialization constants. Optional. If NULL, + // specialization constants receive the values from the shader text. + void *constant_data; + + // Push constant region. Must be be a multiple of 4 <= limits.max_pushc_size + size_t push_constants_size; + + // The shader text in GLSL. For PL_PASS_RASTER, this is interpreted + // as a fragment shader. For PL_PASS_COMPUTE, this is interpreted as + // a compute shader. + const char *glsl_shader; + + // --- type==PL_PASS_RASTER only + + // Describes the interpretation and layout of the vertex data. + enum pl_prim_type vertex_type; + struct pl_vertex_attrib *vertex_attribs; + int num_vertex_attribs; + size_t vertex_stride; // must be a multiple of limits.align_vertex_stride + + // The vertex shader itself. + const char *vertex_shader; + + // Target format. The format must support PL_FMT_CAP_RENDERABLE. The + // resulting pass may only be used on textures that have a format with a + // `pl_fmt.signature` compatible to this format. + pl_fmt target_format; + + // Target blending mode. If this is NULL, blending is disabled. Otherwise, + // the `target_format` must also support PL_FMT_CAP_BLENDABLE. + const struct pl_blend_params *blend_params; + + // If false, the target's existing contents will be discarded before the + // pass is run. (Semantically equivalent to calling pl_tex_invalidate + // before every pl_pass_run, but slightly more efficient) + // + // Specifying `blend_params` requires `load_target` to be true. + bool load_target; + + // --- Deprecated / removed fields. + PL_DEPRECATED const uint8_t *cached_program; // Non-functional + PL_DEPRECATED size_t cached_program_len; +}; + +#define pl_pass_params(...) 
(&(struct pl_pass_params) { __VA_ARGS__ }) + +// Conflates the following typical GPU API concepts: +// - various kinds of shaders +// - rendering pipelines +// - descriptor sets, uniforms, other bindings +// - all synchronization necessary +// - the current values of all inputs +// +// Thread-safety: Unsafe +typedef const struct pl_pass_t { + struct pl_pass_params params; +} *pl_pass; + +// Compile a shader and create a render pass. This is a rare/expensive +// operation and may take a significant amount of time, even if a cached +// program is used. Returns NULL on failure. +PL_API pl_pass pl_pass_create(pl_gpu gpu, const struct pl_pass_params *params); +PL_API void pl_pass_destroy(pl_gpu gpu, pl_pass *pass); + +struct pl_desc_binding { + const void *object; // pl_* object with type corresponding to pl_desc_type + + // For PL_DESC_SAMPLED_TEX, this can be used to configure the sampler. + enum pl_tex_address_mode address_mode; + enum pl_tex_sample_mode sample_mode; +}; + +struct pl_var_update { + int index; // index into params.variables[] + const void *data; // pointer to raw byte data corresponding to pl_var_host_layout() +}; + +struct pl_pass_run_params { + pl_pass pass; + + // If present, the shader will be re-specialized with the new constants + // provided. This is a significantly cheaper operation than recompiling a + // brand new shader, but should still be avoided if possible. + // + // Leaving it as NULL re-uses the existing specialization values. Ignored + // if the shader has no specialization constants. Guaranteed to be a no-op + // if the values have not changed since the last invocation. + void *constant_data; + + // This list only contains descriptors/variables which have changed + // since the previous invocation. All non-mentioned variables implicitly + // preserve their state from the last invocation. + struct pl_var_update *var_updates; + int num_var_updates; + + // This list contains all descriptors used by this pass. It must + // always be filled, even if the descriptors haven't changed. The order + // must match that of pass->params.descriptors + struct pl_desc_binding *desc_bindings; + + // The push constants for this invocation. This must always be set and + // fully defined for every invocation if params.push_constants_size > 0. + void *push_constants; + + // An optional timer to report the approximate runtime of this shader pass + // invocation to. Note that this is only an approximation, since shaders + // may overlap their execution times and contend for GPU time. + pl_timer timer; + + // --- pass->params.type==PL_PASS_RASTER only + + // Target must be a 2D texture, `target->params.renderable` must be true, + // and `target->params.format->signature` must match the signature provided + // in `pass->params.target_format`. + // + // If the viewport or scissors are left blank, they are inferred from + // target->params. + // + // WARNING: Rendering to a *target that is being read from by the same + // shader is undefined behavior. In general, trying to bind the same + // resource multiple times to the same shader is undefined behavior. + pl_tex target; + pl_rect2d viewport; // screen space viewport (must be normalized) + pl_rect2d scissors; // target render scissors (must be normalized) + + // Number of vertices to render + int vertex_count; + + // Vertex data may be provided in one of two forms: + // + // 1. Drawing from host memory directly + const void *vertex_data; + // 2. 
Drawing from a vertex buffer (requires `vertex_buf->params.drawable`) + pl_buf vertex_buf; + size_t buf_offset; + + // (Optional) Index data may be provided in the form given by `index_fmt`. + // These will be used for instanced rendering. Similar to vertex data, this + // can be provided in two forms: + // 1. From host memory + const void *index_data; + enum pl_index_format index_fmt; + // 2. From an index buffer (requires `index_buf->params.drawable`) + pl_buf index_buf; + size_t index_offset; + // Note: Drawing from an index buffer requires vertex data to also be + // present in buffer form, i.e. it's forbidden to mix `index_buf` with + // `vertex_data` (though vice versa is allowed). + + // --- pass->params.type==PL_PASS_COMPUTE only + + // Number of work groups to dispatch per dimension (X/Y/Z). Must be <= the + // corresponding index of limits.max_dispatch + int compute_groups[3]; +}; + +#define pl_pass_run_params(...) (&(struct pl_pass_run_params) { __VA_ARGS__ }) + +// Execute a render pass. +PL_API void pl_pass_run(pl_gpu gpu, const struct pl_pass_run_params *params); + +// This is semantically a no-op, but it provides a hint that you want to flush +// any partially queued up commands and begin execution. There is normally no +// need to call this, because queued commands will always be implicitly flushed +// whenever necessary to make forward progress on commands like `pl_buf_poll`, +// or when submitting a frame to a swapchain for display. In fact, calling this +// function can negatively impact performance, because some GPUs rely on being +// able to re-order and modify queued commands in order to enable optimizations +// retroactively. +// +// The only time this might be beneficial to call explicitly is if you're doing +// lots of offline processing, i.e. you aren't rendering to a swapchain but to +// textures that you download from again. In that case you should call this +// function after each "work item" to ensure good parallelism between them. +// +// It's worth noting that this function may block if you're over-feeding the +// GPU without waiting for existing results to finish. +PL_API void pl_gpu_flush(pl_gpu gpu); + +// This is like `pl_gpu_flush` but also blocks until the GPU is fully idle +// before returning. Using this in your rendering loop is seriously disadvised, +// and almost never the right solution. The intended use case is for deinit +// logic, where users may want to force the all pending GPU operations to +// finish so they can clean up their state more easily. +// +// After this operation is called, it's guaranteed that all pending buffer +// operations are complete - i.e. `pl_buf_poll` is guaranteed to return false. +// It's also guaranteed that any outstanding timer query results are available. +// +// Note: If you only care about buffer operations, you can accomplish this more +// easily by using `pl_buf_poll` with the timeout set to `UINT64_MAX`. But if +// you have many buffers it may be more convenient to call this function +// instead. The difference is that this function will also affect e.g. renders +// to a `pl_swapchain`. +PL_API void pl_gpu_finish(pl_gpu gpu); + +// Returns true if the GPU is considered to be in a "failed" state, which +// during normal operation is typically the result of things like the device +// being lost (due to e.g. power management). +// +// If this returns true, users *should* destroy and recreate the `pl_gpu`, +// including all associated resources, via the appropriate mechanism. 
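+//
+// A minimal sketch of what this might look like in a render loop (the
+// recreate_gpu_and_resources() helper is purely hypothetical and stands in
+// for whatever API-specific teardown and recreation the user performs):
+//
+//   if (pl_gpu_is_failed(gpu)) {
+//       recreate_gpu_and_resources(&gpu);
+//       continue;
+//   }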
+PL_API bool pl_gpu_is_failed(pl_gpu gpu); + + +// Deprecated objects and functions: + +// A generic synchronization object intended for use with an external API. This +// is not required when solely using libplacebo API functions, as all required +// synchronisation is done internally. This comes in the form of a pair of +// semaphores - one to synchronize access in each direction. +// +// Thread-safety: Unsafe +typedef const struct pl_sync_t { + enum pl_handle_type handle_type; + + // This handle is signalled by the `pl_gpu`, and waited on by the user. It + // fires when it is safe for the user to access the shared resource. + union pl_handle wait_handle; + + // This handle is signalled by the user, and waited on by the `pl_gpu`. It + // must fire when the user has finished accessing the shared resource. + union pl_handle signal_handle; +} *pl_sync; + +// Create a synchronization object. Returns NULL on failure. +// +// `handle_type` must be exactly *one* of `pl_gpu.export_caps.sync`, and +// indicates which type of handle to generate for sharing this sync object. +// +// Deprecated in favor of API-specific semaphore creation operations such as +// `pl_vulkan_sem_create`. +PL_DEPRECATED PL_API pl_sync pl_sync_create(pl_gpu gpu, enum pl_handle_type handle_type); + +// Destroy a `pl_sync`. Note that this invalidates the externally imported +// semaphores. Users should therefore make sure that all operations that +// wait on or signal any of the semaphore have been fully submitted and +// processed by the external API before destroying the `pl_sync`. +// +// Despite this, it's safe to destroy a `pl_sync` if the only pending +// operations that involve it are internal to libplacebo. +PL_DEPRECATED PL_API void pl_sync_destroy(pl_gpu gpu, pl_sync *sync); + +// Initiates a texture export operation, allowing a texture to be accessed by +// an external API. Returns whether successful. After this operation +// successfully returns, it is guaranteed that `sync->wait_handle` will +// eventually be signalled. For APIs where this is relevant, the image layout +// should be specified as "general", e.g. `GL_LAYOUT_GENERAL_EXT` for OpenGL. +// +// There is no corresponding "import" operation - the next operation that uses +// a texture will implicitly import the texture. Valid API usage requires that +// the user *must* submit a semaphore signal operation on `sync->signal_handle` +// before doing so. Not doing so is undefined behavior and may very well +// deadlock the calling process and/or the graphics card! +// +// Note that despite this restriction, it is always valid to call +// `pl_tex_destroy`, even if the texture is in an exported state, without +// having to signal the corresponding sync object first. +// +// Deprecated in favor of API-specific synchronization mechanisms such as +// `pl_vulkan_hold/release_ex`. +PL_DEPRECATED PL_API bool pl_tex_export(pl_gpu gpu, pl_tex tex, pl_sync sync); + + +PL_API_END + +#endif // LIBPLACEBO_GPU_H_ diff --git a/src/include/libplacebo/log.h b/src/include/libplacebo/log.h new file mode 100644 index 0000000..b24c931 --- /dev/null +++ b/src/include/libplacebo/log.h @@ -0,0 +1,113 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_LOG_H_ +#define LIBPLACEBO_LOG_H_ + +#include <libplacebo/config.h> +#include <libplacebo/common.h> + +PL_API_BEGIN + +// The log level associated with a given log message. +enum pl_log_level { + PL_LOG_NONE = 0, + PL_LOG_FATAL, // results in total loss of function of a major component + PL_LOG_ERR, // serious error; may result in degraded function + PL_LOG_WARN, // warning; potentially bad, probably user-relevant + PL_LOG_INFO, // informational message, also potentially harmless errors + PL_LOG_DEBUG, // verbose debug message, informational + PL_LOG_TRACE, // very noisy trace of activity,, usually benign + PL_LOG_ALL = PL_LOG_TRACE, +}; + +struct pl_log_params { + // Logging callback. All messages, informational or otherwise, will get + // redirected to this callback. The logged messages do not include trailing + // newlines. Optional. + void (*log_cb)(void *log_priv, enum pl_log_level level, const char *msg); + void *log_priv; + + // The current log level. Controls the level of message that will be + // redirected to the log callback. Setting this to PL_LOG_ALL means all + // messages will be forwarded, but doing so indiscriminately can result + // in increased CPU usage as it may enable extra debug paths based on the + // configured log level. + enum pl_log_level log_level; +}; + +#define pl_log_params(...) (&(struct pl_log_params) { __VA_ARGS__ }) +PL_API extern const struct pl_log_params pl_log_default_params; + +// Thread-safety: Safe +// +// Note: In any context in which `pl_log` is used, users may also pass NULL +// to disable logging. In other words, NULL is a valid `pl_log`. +typedef const struct pl_log_t { + struct pl_log_params params; +} *pl_log; + +#define pl_log_glue1(x, y) x##y +#define pl_log_glue2(x, y) pl_log_glue1(x, y) +// Force a link error in the case of linking against an incompatible API +// version. +#define pl_log_create pl_log_glue2(pl_log_create_, PL_API_VER) +// Creates a pl_log. `api_ver` is for historical reasons and ignored currently. +// `params` defaults to `&pl_log_default_params` if left as NULL. +// +// Note: As a general rule, any `params` struct used as an argument to a +// function need only live until the corresponding function returns. +PL_API pl_log pl_log_create(int api_ver, const struct pl_log_params *params); + +// Destroy a `pl_log` object. +// +// Note: As a general rule, all `_destroy` functions take the pointer to the +// object to free as their parameter. This pointer is overwritten by NULL +// afterwards. Calling a _destroy function on &{NULL} is valid, but calling it +// on NULL itself is invalid. +PL_API void pl_log_destroy(pl_log *log); + +// Update the parameters of a `pl_log` without destroying it. This can be +// used to change the log function, log context or log level retroactively. +// `params` defaults to `&pl_log_default_params` if left as NULL. +// +// Returns the previous params, atomically. +PL_API struct pl_log_params pl_log_update(pl_log log, const struct pl_log_params *params); + +// Like `pl_log_update` but only updates the log level, leaving the log +// callback intact. 
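+//
+// For example (an illustrative sketch, not a required pattern), temporarily
+// silencing a logger around a noisy operation could be done as follows:
+//
+//   enum pl_log_level prev = pl_log_level_update(log, PL_LOG_NONE);
+//   do_noisy_work();  // hypothetical user function
+//   pl_log_level_update(log, prev);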
+// +// Returns the previous log level, atomically. +PL_API enum pl_log_level pl_log_level_update(pl_log log, enum pl_log_level level); + +// Two simple, stream-based loggers. You can use these as the log_cb. If you +// also set log_priv to a FILE* (e.g. stdout or stderr) it will be printed +// there; otherwise, it will be printed to stdout or stderr depending on the +// log level. +// +// The version with colors will use ANSI escape sequences to indicate the log +// level. The version without will use explicit prefixes. +PL_API void pl_log_simple(void *stream, enum pl_log_level level, const char *msg); +PL_API void pl_log_color(void *stream, enum pl_log_level level, const char *msg); + +// Backwards compatibility with older versions of libplacebo +#define pl_context pl_log +#define pl_context_params pl_log_params + +PL_API_END + +#endif // LIBPLACEBO_LOG_H_ diff --git a/src/include/libplacebo/meson.build b/src/include/libplacebo/meson.build new file mode 100644 index 0000000..2f4631e --- /dev/null +++ b/src/include/libplacebo/meson.build @@ -0,0 +1,6 @@ +sources += configure_file( + input: 'config.h.in', + output: 'config.h', + install_dir: get_option('includedir') / meson.project_name(), + configuration: conf_public, +) diff --git a/src/include/libplacebo/opengl.h b/src/include/libplacebo/opengl.h new file mode 100644 index 0000000..46597b2 --- /dev/null +++ b/src/include/libplacebo/opengl.h @@ -0,0 +1,230 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_OPENGL_H_ +#define LIBPLACEBO_OPENGL_H_ + +#include <string.h> + +#include <libplacebo/gpu.h> +#include <libplacebo/swapchain.h> + +PL_API_BEGIN + +// Note on thread safety: The thread safety of `pl_opengl` and any associated +// GPU objects follows the same thread safety rules as the underlying OpenGL +// context. In other words, they must only be called from the thread the OpenGL +// context is current on. + +typedef const struct pl_opengl_t { + pl_gpu gpu; + + // Detected GL version + int major, minor; + + // List of GL/EGL extensions, provided for convenience + const char * const *extensions; + int num_extensions; +} *pl_opengl; + +static inline bool pl_opengl_has_ext(pl_opengl gl, const char *ext) +{ + for (int i = 0; i < gl->num_extensions; i++) + if (!strcmp(ext, gl->extensions[i])) + return true; + return false; +} + +typedef void (*pl_voidfunc_t)(void); + +struct pl_opengl_params { + // Main gl*GetProcAddr function. This will be used to load all GL/EGL + // functions. Optional - if unspecified, libplacebo will default to an + // internal loading logic which should work on most platforms. + pl_voidfunc_t (*get_proc_addr_ex)(void *proc_ctx, const char *procname); + void *proc_ctx; + + // Simpler API for backwards compatibility / convenience. 
(This one + // directly matches the signature of most gl*GetProcAddr library functions) + pl_voidfunc_t (*get_proc_addr)(const char *procname); + + // Enable OpenGL debug report callbacks. May have little effect depending + // on whether or not the GL context was initialized with appropriate + // debugging enabled. + bool debug; + + // Allow the use of (suspected) software rasterizers and renderers. These + // can be useful for debugging purposes, but normally, their use is + // undesirable when GPU-accelerated processing is expected. + bool allow_software; + + // Restrict the maximum allowed GLSL version. (Mainly for testing) + int max_glsl_version; + + // Optional. Required when importing/exporting dmabufs as textures. + void *egl_display; + void *egl_context; + + // Optional callbacks to bind/release the OpenGL context on the current + // thread. If these are specified, then the resulting `pl_gpu` will have + // `pl_gpu_limits.thread_safe` enabled, and may therefore be used from any + // thread without first needing to bind the OpenGL context. + // + // If the user is re-using the same OpenGL context in non-libplacebo code, + // then these callbacks should include whatever synchronization is + // necessary to prevent simultaneous use between libplacebo and the user. + bool (*make_current)(void *priv); + void (*release_current)(void *priv); + void *priv; +}; + +// Default/recommended parameters +#define pl_opengl_params(...) (&(struct pl_opengl_params) { __VA_ARGS__ }) +PL_API extern const struct pl_opengl_params pl_opengl_default_params; + +// Creates a new OpenGL renderer based on the given parameters. This will +// internally use whatever platform-defined mechanism (WGL, X11, EGL) is +// appropriate for loading the OpenGL function calls, so the user doesn't need +// to pass in a `getProcAddress` callback. If `params` is left as NULL, it +// defaults to `&pl_opengl_default_params`. The context must be active when +// calling this function, and must remain active whenever calling any +// libplacebo function on the resulting `pl_opengl` or `pl_gpu`. +// +// Note that creating multiple `pl_opengl` instances from the same OpenGL +// context is undefined behavior. +PL_API pl_opengl pl_opengl_create(pl_log log, const struct pl_opengl_params *params); + +// All resources allocated from the `pl_gpu` contained by this `pl_opengl` must +// be explicitly destroyed by the user before calling `pl_opengl_destroy`. +PL_API void pl_opengl_destroy(pl_opengl *gl); + +// For a `pl_gpu` backed by `pl_opengl`, this function can be used to retrieve +// the underlying `pl_opengl`. Returns NULL for any other type of `gpu`. +PL_API pl_opengl pl_opengl_get(pl_gpu gpu); + +struct pl_opengl_framebuffer { + // ID of the framebuffer, or 0 to use the context's default framebuffer. + int id; + + // If true, then the framebuffer is assumed to be "flipped" relative to + // normal GL semantics, i.e. set this to `true` if the first pixel is the + // top left corner. + bool flipped; +}; + +struct pl_opengl_swapchain_params { + // Set this to the platform-specific function to swap buffers, e.g. + // glXSwapBuffers, eglSwapBuffers etc. This will be called internally by + // `pl_swapchain_swap_buffers`. Required, unless you never call that + // function. + void (*swap_buffers)(void *priv); + + // Initial framebuffer description. This can be changed later on using + // `pl_opengl_swapchain_update_fb`. + struct pl_opengl_framebuffer framebuffer; + + // Attempt forcing a specific latency. 
If this is nonzero, then + // `pl_swapchain_swap_buffers` will wait until fewer than N frames are "in + // flight" before returning. Setting this to a high number generally + // accomplished nothing, because the OpenGL driver typically limits the + // number of buffers on its own. But setting it to a low number like 2 or + // even 1 can reduce latency (at the cost of throughput). + int max_swapchain_depth; + + // Arbitrary user pointer that gets passed to `swap_buffers` etc. + void *priv; +}; + +#define pl_opengl_swapchain_params(...) (&(struct pl_opengl_swapchain_params) { __VA_ARGS__ }) + +// Creates an instance of `pl_swapchain` tied to the active context. +// Note: Due to OpenGL semantics, users *must* call `pl_swapchain_resize` +// before attempting to use this swapchain, otherwise calls to +// `pl_swapchain_start_frame` will fail. +PL_API pl_swapchain pl_opengl_create_swapchain(pl_opengl gl, + const struct pl_opengl_swapchain_params *params); + +// Update the framebuffer description. After calling this function, users +// *must* call `pl_swapchain_resize` before attempting to use the swapchain +// again, otherwise calls to `pl_swapchain_start_frame` will fail. +PL_API void pl_opengl_swapchain_update_fb(pl_swapchain sw, + const struct pl_opengl_framebuffer *fb); + +struct pl_opengl_wrap_params { + // The GLuint texture object itself. Optional. If no texture is provided, + // then only the opaque framebuffer `fbo` will be wrapped, leaving the + // resulting `pl_tex` object with some operations (such as sampling) being + // unsupported. + unsigned int texture; + + // The GLuint associated framebuffer. Optional. If this is not specified, + // then libplacebo will attempt creating a framebuffer from the provided + // texture object (if possible). + // + // Note: As a special case, if neither a texture nor an FBO are provided, + // this is equivalent to wrapping the OpenGL default framebuffer (id 0). + unsigned int framebuffer; + + // The image's dimensions (unused dimensions must be 0) + int width; + int height; + int depth; + + // Texture-specific fields: + // + // Note: These are only relevant if `texture` is provided. + + // The GLenum for the texture target to use, e.g. GL_TEXTURE_2D. Optional. + // If this is left as 0, the target is inferred from the number of + // dimensions. Users may want to set this to something specific like + // GL_TEXTURE_EXTERNAL_OES depending on the nature of the texture. + unsigned int target; + + // The texture's GLint sized internal format (e.g. GL_RGBA16F). Required. + int iformat; +}; + +#define pl_opengl_wrap_params(...) (&(struct pl_opengl_wrap_params) { __VA_ARGS__ }) + +// Wraps an external OpenGL object into a `pl_tex` abstraction. Due to the +// internally synchronized nature of OpenGL, no explicit synchronization +// is needed between libplacebo `pl_tex_` operations, and host accesses to +// the texture. Wrapping the same OpenGL texture multiple times is permitted. +// Note that this function transfers no ownership. +// +// This wrapper can be destroyed by simply calling `pl_tex_destroy` on it, +// which will *not* destroy the user-provided OpenGL texture or framebuffer. +// +// This function may fail, in which case it returns NULL. +PL_API pl_tex pl_opengl_wrap(pl_gpu gpu, const struct pl_opengl_wrap_params *params); + +// Analogous to `pl_opengl_wrap`, this function takes any `pl_tex` (including +// ones created by `pl_tex_create`) and unwraps it to expose the underlying +// OpenGL texture to the user. 
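To illustrate how `pl_opengl_create`, the swapchain parameters and `pl_swapchain_resize` described above fit together, here is a minimal sketch. `my_get_proc_addr` and `my_swap_buffers` are hypothetical placeholders for glue normally supplied by the windowing layer (GLFW, SDL, EGL, ...), and the chosen `max_swapchain_depth` is just an example.

```c
#include <libplacebo/opengl.h>

// Hypothetical glue provided by the windowing layer, not by libplacebo
extern pl_voidfunc_t my_get_proc_addr(const char *procname);
extern void my_swap_buffers(void *priv);

void init_opengl_output(pl_log log, void *window, int *w, int *h)
{
    // The GL context must be current on this thread for this entire function
    pl_opengl gl = pl_opengl_create(log, pl_opengl_params(
        .get_proc_addr = my_get_proc_addr,
        .debug         = true // request GL debug callbacks where available
    ));
    if (!gl)
        return;

    pl_swapchain sw = pl_opengl_create_swapchain(gl, pl_opengl_swapchain_params(
        .swap_buffers        = my_swap_buffers,
        .priv                = window, // forwarded to my_swap_buffers
        .max_swapchain_depth = 2       // trade some throughput for latency
    ));

    // Mandatory before the first pl_swapchain_start_frame()
    if (!sw || !pl_swapchain_resize(sw, w, h)) {
        pl_swapchain_destroy(&sw);
        pl_opengl_destroy(&gl);
        return;
    }

    // ... render frames; destroy `sw` before `gl` during teardown ...
}
```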
Note that this function transfers no ownership, +// i.e. the texture object and framebuffer shall not be destroyed by the user. +// +// Returns the OpenGL texture. `out_target` and `out_iformat` will be updated +// to hold the target type and internal format, respectively. (Optional) +// +// For renderable/blittable textures, `out_fbo` will be updated to the ID of +// the framebuffer attached to this texture, or 0 if there is none. (Optional) +PL_API unsigned int pl_opengl_unwrap(pl_gpu gpu, pl_tex tex, unsigned int *out_target, + int *out_iformat, unsigned int *out_fbo); + +PL_API_END + +#endif // LIBPLACEBO_OPENGL_H_ diff --git a/src/include/libplacebo/options.h b/src/include/libplacebo/options.h new file mode 100644 index 0000000..e40f5e7 --- /dev/null +++ b/src/include/libplacebo/options.h @@ -0,0 +1,201 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_OPTIONS_H_ +#define LIBPLACEBO_OPTIONS_H_ + +#include <libplacebo/renderer.h> + +PL_API_BEGIN + +// High-level heap-managed struct containing storage for all options implied by +// pl_render_params, including a high-level interface for serializing, +// deserializing and interfacing with them in a programmatic way. + +typedef const struct pl_opt_t *pl_opt; +typedef struct pl_options_t { + // Non-NULL `params.*_params` pointers must always point into this struct + struct pl_render_params params; + + // Backing storage for all of the various rendering parameters. Whether + // or not these params are active is determined by whether or not + // `params.*_params` is set to this address or NULL. + struct pl_deband_params deband_params; + struct pl_sigmoid_params sigmoid_params; + struct pl_color_adjustment color_adjustment; + struct pl_peak_detect_params peak_detect_params; + struct pl_color_map_params color_map_params; + struct pl_dither_params dither_params; + struct pl_icc_params icc_params PL_DEPRECATED; + struct pl_cone_params cone_params; + struct pl_blend_params blend_params; + struct pl_deinterlace_params deinterlace_params; + struct pl_distort_params distort_params; + + // Backing storage for "custom" scalers. `params.upscaler` etc. will + // always be a pointer either to a built-in pl_filter_config, or one of + // these structs. `name`, `description` and `allowed` will always be + // valid for the respective type of filter config. + struct pl_filter_config upscaler; + struct pl_filter_config downscaler; + struct pl_filter_config plane_upscaler; + struct pl_filter_config plane_downscaler; + struct pl_filter_config frame_mixer; +} *pl_options; + +// Allocate a new set of render params, with internally backed storage for +// all parameters. Initialized to an "empty" config (PL_RENDER_DEFAULTS), +// equivalent to `&pl_render_fast_params`. 
To initialize the struct instead to +// the recommended default parameters, use `pl_options_reset` with +// `pl_render_default_params`. +// +// If `log` is provided, errors related to parsing etc. will be logged there. +PL_API pl_options pl_options_alloc(pl_log log); +PL_API void pl_options_free(pl_options *opts); + +// Resets all options to their default values from a given struct. If `preset` +// is NULL, `opts` is instead reset back to the initial "empty" configuration, +// with all options disabled, as if it was freshly allocated. +// +// Note: This function will also reset structs which were not included in +// `preset`, such as any custom upscalers. +PL_API void pl_options_reset(pl_options opts, const struct pl_render_params *preset); + +typedef const struct pl_opt_data_t { + // Original options struct. + pl_options opts; + + // Triggering option for this callback invocation. + pl_opt opt; + + // The raw data associated with this option. Always some pointer into + // `opts`. Note that only PL_OPT_BOOL, PL_OPT_INT and PL_OPT_FLOAT have + // a fixed representation, for other fields its usefulness is dubious. + const void *value; + + // The underlying data, as a formatted, locale-invariant string. Lifetime + // is limited until the return of this callback. + const char *text; +} *pl_opt_data; + +// Query a single option from `opts` by key, or NULL if none was found. +// The resulting pointer is only valid until the next pl_options_* call. +PL_API pl_opt_data pl_options_get(pl_options opts, const char *key); + +// Update an option from a formatted value string (see `pl_opt_data.text`). +// This can be used for all types of options, even non-string ones. In this case, +// `value` will be parsed according to the option type. +// +// Returns whether successful. +PL_API bool pl_options_set_str(pl_options opts, const char *key, const char *value); + +// Programmatically iterate over options set in a `pl_options`, running the +// provided callback on each entry. +PL_API void pl_options_iterate(pl_options opts, + void (*cb)(void *priv, pl_opt_data data), + void *priv); + +// Serialize a `pl_options` struct to a comma-separated key/value string. The +// returned string has a lifetime valid until either the next call to +// `pl_options_save`, or until the `pl_options` is freed. +PL_API const char *pl_options_save(pl_options opts); + +// Parse a `pl_options` struct from a key/value string, in standard syntax +// "key1=value1,key2=value2,...", and update `opts` with the new values. +// Valid separators include whitespace, commas (,) and (semi)colons (:;). +// +// Returns true if no errors occurred. +PL_API bool pl_options_load(pl_options opts, const char *str); + +// Helpers for interfacing with `opts->params.hooks`. Note that using any of +// these helpers will overwrite the array with an internally managed pointer, +// so care must be taken when combining them with external management of +// this memory. Negative indices are possible and are counted relative to the +// end of the list. +// +// Note: These hooks are *not* included in pl_options_save() and related. +PL_API void pl_options_add_hook(pl_options opts, const struct pl_hook *hook); +PL_API void pl_options_insert_hook(pl_options opts, const struct pl_hook *hook, int idx); +PL_API void pl_options_remove_hook_at(pl_options opts, int idx); + +// Underlying options system and list +// +// Note: By necessity, this option list does not cover every single field +// present in `pl_render_params`.
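A short sketch of the options API described above; the keys and values used ("upscaler", "deband", "ewa_lanczos") are illustrative examples and must correspond to actual entries in `pl_option_list`.

```c
#include <stdio.h>
#include <libplacebo/options.h>

void configure_options(pl_log log)
{
    pl_options opts = pl_options_alloc(log);

    // Start from the recommended defaults rather than the empty preset
    pl_options_reset(opts, &pl_render_default_params);

    // Apply a user-supplied settings string (keys shown are illustrative)
    if (!pl_options_load(opts, "upscaler=ewa_lanczos,deband=yes"))
        fprintf(stderr, "failed parsing options string\n");

    // Individual options can also be updated from strings, regardless of type
    pl_options_set_str(opts, "deband", "no");

    // Serialize the current state, e.g. to persist user settings
    printf("current options: %s\n", pl_options_save(opts));

    // `&opts->params` can now be passed to pl_render_image(_mix)

    pl_options_free(&opts);
}
```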
In particular, fields like `info_callback`, +// `lut` and `hooks` cannot be configured through the options system, as doing +// so would require interop with C code or I/O. (However, see +// `pl_options_add_hook` and related) + +enum pl_option_type { + // Accepts `yes/no`, `on/off`, `true/false` and variants + PL_OPT_BOOL, + + // Parsed as human-readable locale-invariant (C) numbers, scientific + // notation accepted for floats + PL_OPT_INT, + PL_OPT_FLOAT, + + // Parsed as a short string containing only alphanumerics and _-, + // corresponding to some name/identifier. Catch-all bucket for several + // other types of options, such as presets, struct pointers, and functions + // + // Note: These options do not correspond to actual strings in C, the + // underlying type of option will determine the values of `size` and + // corresponding interpretation of pointers. + PL_OPT_STRING, + + PL_OPT_TYPE_COUNT, +}; + +struct pl_opt_t { + // Programmatic key uniquely identifying this option. + const char *key; + + // Longer, human readable friendly name + const char *name; + + // Data type of option, affects how it is parsed. This field is purely + // informative for the user, the actual implementation may vary. + enum pl_option_type type; + + // Minimum/maximum value ranges for numeric options (int / float) + // If both are 0.0, these limits are disabled/ignored. + float min, max; + + // If true, this option is considered deprecated and may be removed + // in the future. + bool deprecated; + + // If true, this option is considered a 'preset' (read-only), which can + // be loaded but not saved. (The equivalent underlying options this preset + // corresponds to will be saved instead) + bool preset; + + // Internal implementation details (for parsing/saving), opaque to user + const void *priv; +}; + +// A list of options, terminated by {0} for convenience +PL_API extern const struct pl_opt_t pl_option_list[]; +PL_API extern const int pl_option_count; // excluding terminating {0} + +// Returns the `pl_option` associated with a given key, or NULL +PL_API pl_opt pl_find_option(const char *key); + +PL_API_END + +#endif // LIBPLACEBO_OPTIONS_H_ diff --git a/src/include/libplacebo/renderer.h b/src/include/libplacebo/renderer.h new file mode 100644 index 0000000..d2e01e4 --- /dev/null +++ b/src/include/libplacebo/renderer.h @@ -0,0 +1,847 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#ifndef LIBPLACEBO_RENDERER_H_ +#define LIBPLACEBO_RENDERER_H_ + +#include <libplacebo/config.h> +#include <libplacebo/colorspace.h> +#include <libplacebo/filters.h> +#include <libplacebo/gpu.h> +#include <libplacebo/shaders/colorspace.h> +#include <libplacebo/shaders/deinterlacing.h> +#include <libplacebo/shaders/dithering.h> +#include <libplacebo/shaders/film_grain.h> +#include <libplacebo/shaders/icc.h> +#include <libplacebo/shaders/lut.h> +#include <libplacebo/shaders/sampling.h> +#include <libplacebo/shaders/custom.h> +#include <libplacebo/swapchain.h> + +PL_API_BEGIN + +// Thread-safety: Unsafe +typedef struct pl_renderer_t *pl_renderer; + +// Enum values used in pl_renderer_errors_t as bit positions for error flags +enum pl_render_error { + PL_RENDER_ERR_NONE = 0, + PL_RENDER_ERR_FBO = 1 << 0, + PL_RENDER_ERR_SAMPLING = 1 << 1, + PL_RENDER_ERR_DEBANDING = 1 << 2, + PL_RENDER_ERR_BLENDING = 1 << 3, + PL_RENDER_ERR_OVERLAY = 1 << 4, + PL_RENDER_ERR_PEAK_DETECT = 1 << 5, + PL_RENDER_ERR_FILM_GRAIN = 1 << 6, + PL_RENDER_ERR_FRAME_MIXING = 1 << 7, + PL_RENDER_ERR_DEINTERLACING = 1 << 8, + PL_RENDER_ERR_ERROR_DIFFUSION = 1 << 9, + PL_RENDER_ERR_HOOKS = 1 << 10, + PL_RENDER_ERR_CONTRAST_RECOVERY = 1 << 11, +}; + +// Struct describing the current renderer state, including internal processing errors, +// as well as a list of signatures of disabled hooks. +struct pl_render_errors { + enum pl_render_error errors; + // List containing signatures of disabled hooks + const uint64_t *disabled_hooks; + int num_disabled_hooks; +}; + +// Creates a new renderer object, which is backed by a GPU context. This is a +// high-level object that takes care of the rendering chain as a whole, from +// the source textures to the finished frame. +PL_API pl_renderer pl_renderer_create(pl_log log, pl_gpu gpu); +PL_API void pl_renderer_destroy(pl_renderer *rr); + +// Returns the current renderer state, see pl_render_errors. +PL_API struct pl_render_errors pl_renderer_get_errors(pl_renderer rr); + +// Clears the error state of the renderer. If `errors` is NULL, all render errors will +// be cleared. Otherwise only selected errors/hooks will be cleared. +// If `PL_RENDER_ERR_HOOKS` is set and `num_disabled_hooks` is 0, clear all hooks. +// Otherwise only selected hooks will be cleared based on the `disabled_hooks` array. +PL_API void pl_renderer_reset_errors(pl_renderer rr, + const struct pl_render_errors *errors); + +enum pl_lut_type { + PL_LUT_UNKNOWN = 0, + PL_LUT_NATIVE, // applied to raw image contents (after fixing bit depth) + PL_LUT_NORMALIZED, // applied to normalized (HDR) RGB values + PL_LUT_CONVERSION, // LUT fully replaces color conversion + + // Note: When using a PL_LUT_CONVERSION to replace the YUV->RGB conversion, + // `pl_render_params.color_adjustment` is no longer applied. Similarly, + // when using a PL_LUT_CONVERSION to replace the image->target color space + // conversion, `pl_render_params.color_map_params` are ignored. + // + // Note: For LUTs attached to the output frame, PL_LUT_CONVERSION should + // instead perform the inverse (RGB->native) conversion. + // + // Note: PL_LUT_UNKNOWN tries inferring the meaning of the LUT from the + // LUT's tagged metadata, and otherwise falls back to PL_LUT_NATIVE.
+}; + +enum pl_render_stage { + PL_RENDER_STAGE_FRAME, // full frame redraws, for fresh/uncached frames + PL_RENDER_STAGE_BLEND, // the output blend pass (only for pl_render_image_mix) + PL_RENDER_STAGE_COUNT, +}; + +struct pl_render_info { + const struct pl_dispatch_info *pass; // information about the shader + enum pl_render_stage stage; // the associated render stage + + // This specifies the chronological index of this pass within the frame and + // stage (starting at `index == 0`). + int index; + + // For PL_RENDER_STAGE_BLEND, this specifies the number of frames + // being blended (since that results in a different shader). + int count; +}; + +// Represents the options used for rendering. These affect the quality of +// the result. +struct pl_render_params { + // Configures the algorithms used for upscaling and downscaling, + // respectively. If left as NULL, then libplacebo will only use inexpensive + // sampling (bilinear or nearest neighbour depending on the capabilities + // of the hardware / texture). + // + // Note: Setting `downscaler` to NULL also implies `skip_anti_aliasing`, + // since the built-in GPU sampling algorithms can't anti-alias. + // + // Note: If set to the same address as the built-in `pl_filter_bicubic`, + // `pl_filter_nearest` etc.; libplacebo will also use the more efficient + // direct sampling algorithm where possible without quality loss. + const struct pl_filter_config *upscaler; + const struct pl_filter_config *downscaler; + + // If set, this overrides the value of `upscaler`/`downscaling` for + // subsampled (chroma) planes. These scalers are used whenever the size of + // multiple different `pl_plane`s in a single `pl_frame` differ, requiring + // adaptation when converting to/from RGB. Note that a value of NULL simply + // means "no override". To force built-in scaling explicitly, set this to + // `&pl_filter_bilinear`. + const struct pl_filter_config *plane_upscaler; + const struct pl_filter_config *plane_downscaler; + + // The anti-ringing strength to apply to filters. See the equivalent option + // in `pl_sample_filter_params` for more information. + float antiringing_strength; + + // Configures the algorithm used for frame mixing (when using + // `pl_render_image_mix`). Ignored otherwise. As a special requirement, + // this must be a filter config with `polar` set to false, since it's only + // used for 1D mixing and thus only 1D filters are compatible. + // + // If set to NULL, frame mixing is disabled, in which case + // `pl_render_image_mix` will use nearest-neighbour semantics. (Note that + // this still goes through the redraw cache, unless you also enable + // `skip_caching_single_frame`) + const struct pl_filter_config *frame_mixer; + + // Configures the settings used to deband source textures. Leaving this as + // NULL disables debanding. + // + // Note: The `deband_params.grain` setting is automatically adjusted to + // prevent blowing up on HDR sources. The user need not account for this. + const struct pl_deband_params *deband_params; + + // Configures the settings used to sigmoidize the image before upscaling. + // This is not always used. If NULL, disables sigmoidization. + const struct pl_sigmoid_params *sigmoid_params; + + // Configures the color adjustment parameters used to decode the color. + // This can be used to apply additional artistic settings such as + // desaturation, etc. If NULL, defaults to &pl_color_adjustment_neutral. 
+ const struct pl_color_adjustment *color_adjustment; + + // Configures the settings used to detect the peak of the source content, + // for HDR sources. Has no effect on SDR content. If NULL, peak detection + // is disabled. + const struct pl_peak_detect_params *peak_detect_params; + + // Configures the settings used to tone map from HDR to SDR, or from higher + // gamut to standard gamut content. If NULL, defaults to + // `&pl_color_map_default_params`. + const struct pl_color_map_params *color_map_params; + + // Configures the settings used to dither to the output depth. Leaving this + // as NULL disables dithering. + const struct pl_dither_params *dither_params; + + // Configures the error diffusion kernel to use for error diffusion + // dithering. If set, this will be used instead of `dither_params` whenever + // possible. Leaving this as NULL disables error diffusion. + const struct pl_error_diffusion_kernel *error_diffusion; + + // Configures the settings used to simulate color blindness, if desired. + // If NULL, this feature is disabled. + const struct pl_cone_params *cone_params; + + // Configures output blending. When rendering to the final target, the + // framebuffer contents will be blended using this blend mode. Requires + // that the target format has PL_FMT_CAP_BLENDABLE. NULL disables blending. + const struct pl_blend_params *blend_params; + + // Configures the settings used to deinterlace frames (see + // `pl_frame.field`), if required.. If NULL, deinterlacing is "disabled", + // meaning interlaced frames are rendered as weaved frames instead. + // + // Note: As a consequence of how `pl_frame` represents individual fields, + // and especially when using the `pl_queue`, this will still result in + // frames being redundantly rendered twice. As such, it's highly + // recommended to, instead, fully disable deinterlacing by not marking + // source frames as interlaced in the first place. + const struct pl_deinterlace_params *deinterlace_params; + + // If set, applies an extra distortion matrix to the image, after + // scaling and before presenting it to the screen. Can be used for e.g. + // fractional rotation. + // + // Note: The distortion canvas will be set to the size of `target->crop`, + // so this cannot effectively draw outside the specified target area, + // nor change the aspect ratio of the image. + const struct pl_distort_params *distort_params; + + // List of custom user shaders / hooks. + // See <libplacebo/shaders/custom.h> for more information. + const struct pl_hook * const *hooks; + int num_hooks; + + // Color mapping LUT. If present, this will be applied as part of the + // image being rendered, in normalized RGB space. + // + // Note: In this context, PL_LUT_NATIVE means "gamma light" and + // PL_LUT_NORMALIZED means "linear light". For HDR signals, normalized LUTs + // are scaled so 1.0 corresponds to the `pl_color_transfer_nominal_peak`. + // + // Note: A PL_LUT_CONVERSION fully replaces the color adaptation from + // `image` to `target`, including any tone-mapping (if necessary) and ICC + // profiles. It has the same representation as PL_LUT_NATIVE, so in this + // case the input and output are (respectively) non-linear light RGB. + const struct pl_custom_lut *lut; + enum pl_lut_type lut_type; + + // If the image being rendered does not span the entire size of the target, + // it will be cleared explicitly using this background color (RGB). To + // disable this logic, set `skip_target_clearing`. 
+ float background_color[3]; + float background_transparency; // 0.0 for opaque, 1.0 for fully transparent + bool skip_target_clearing; + + // If set to a value above 0.0, the output will be rendered with rounded + // corners, as if an alpha transparency mask had been applied. The value + // indicates the relative fraction of the side length to round - a value + // of 1.0 rounds the corners as much as possible. + float corner_rounding; + + // If true, then transparent images will be made opaque by painting them + // against a checkerboard pattern consisting of alternating colors. If both + // colors are left as {0}, they default respectively to 93% and 87% gray. + bool blend_against_tiles; + float tile_colors[2][3]; + int tile_size; + + // --- Performance / quality trade-off options: + // These should generally be left off where quality is desired, as they can + // degrade the result quite noticeably; but may be useful for older or + // slower hardware. Note that libplacebo will automatically disable + // advanced features on hardware where they are unsupported, regardless of + // these settings. So only enable them if you need a performance bump. + + // Disables anti-aliasing on downscaling. This will result in moiré + // artifacts and nasty, jagged pixels when downscaling, except for some + // very limited special cases (e.g. bilinear downsampling to exactly 0.5x). + // + // Significantly speeds up downscaling with high downscaling ratios. + bool skip_anti_aliasing; + + // Normally, when the size of the `target` used with `pl_render_image_mix` + // changes, or the render parameters are updated, the internal cache of + // mixed frames must be discarded in order to re-render all required + // frames. Setting this option to `true` will skip the cache invalidation + // and instead re-use the existing frames (with bilinear scaling to the new + // size if necessary), which comes at a quality loss shortly after a + // resize, but should make it much smoother. + bool preserve_mixing_cache; + + // --- Performance tuning / debugging options + // These may affect performance or may make debugging problems easier, + // but shouldn't have any effect on the quality. + + // Normally, `pl_render_image_mix` will also push single frames through the + // mixer cache, in order to speed up re-draws. Enabling this option + // disables that logic, causing single frames to bypass the cache. (Though + // it will still read from the cache, if the frames happen to already be cached) + bool skip_caching_single_frame; + + // Disables linearization / sigmoidization before scaling. This might be + // useful when tracking down unexpected image artifacts or excessive + // ringing, but it shouldn't normally be necessary. + bool disable_linear_scaling; + + // Forces the use of the "general" scaling algorithms even when using the + // special-cased built-in presets like `pl_filter_bicubic`. Basically, this + // disables the more efficient implementations in favor of the slower, + // general-purpose ones. + bool disable_builtin_scalers; + + // Forces correction of subpixel offsets (using the configured `upscaler`). + bool correct_subpixel_offsets; + + // Forces the use of dithering, even when rendering to 16-bit FBOs. This is + // generally pretty pointless because most 16-bit FBOs have high enough + // depth that rounding errors are below the human perception threshold, + // but this can be used to test the dither code.
+ bool force_dither; + + // Disables the gamma-correct dithering logic which normally applies when + // dithering to low bit depths. No real use, outside of testing. + bool disable_dither_gamma_correction; + + // Completely overrides the use of FBOs, as if there were no renderable + // texture format available. This disables most features. + bool disable_fbos; + + // Use only low-bit-depth FBOs (8 bits). Note that this also implies + // disabling linear scaling and sigmoidization. + bool force_low_bit_depth_fbos; + + // If this is true, all shaders will be generated as "dynamic" shaders, + // with any compile-time constants being replaced by runtime-adjustable + // values. This is generally a performance loss, but has the advantage of + // being able to freely change parameters without triggering shader + // recompilations. + // + // It's a good idea to enable while presenting configurable settings to the + // user, but it should be set to false once those values are "dialed in". + bool dynamic_constants; + + // This callback is invoked for every pass successfully executed in the + // process of rendering a frame. Optional. + // + // Note: `info` is only valid until this function returns. + void (*info_callback)(void *priv, const struct pl_render_info *info); + void *info_priv; + + // --- Deprecated/removed fields + bool allow_delayed_peak_detect PL_DEPRECATED; // moved to pl_peak_detect_params + const struct pl_icc_params *icc_params PL_DEPRECATED; // use pl_frame.icc + bool ignore_icc_profiles PL_DEPRECATED; // non-functional, just set pl_frame.icc to NULL + int lut_entries PL_DEPRECATED; // hard-coded as 256 + float polar_cutoff PL_DEPRECATED; // hard-coded as 1e-3 +}; + +// Bare minimum parameters, with no features enabled. This is the fastest +// possible configuration, and should therefore be fine on any system. +#define PL_RENDER_DEFAULTS \ + .color_map_params = &pl_color_map_default_params, \ + .color_adjustment = &pl_color_adjustment_neutral, \ + .tile_colors = {{0.93, 0.93, 0.93}, \ + {0.87, 0.87, 0.87}}, \ + .tile_size = 32, + +#define pl_render_params(...) (&(struct pl_render_params) { PL_RENDER_DEFAULTS __VA_ARGS__ }) +PL_API extern const struct pl_render_params pl_render_fast_params; + +// This contains the default/recommended options for reasonable image quality, +// while also not being too terribly slow. All of the *_params structs are +// defaulted to the corresponding *_default_params, except for deband_params, +// which is disabled by default. +// +// This should be fine on most integrated GPUs, but if it's too slow, +// consider using `pl_render_fast_params` instead. +PL_API extern const struct pl_render_params pl_render_default_params; + +// This contains a higher quality preset for better image quality at the cost +// of quite a bit of performance. In addition to the settings implied by +// `pl_render_default_params`, it enables debanding, sets the upscaler to +// `pl_filter_ewa_lanczossharp`, and uses pl_*_high_quality_params structs where +// available. This should only really be used with a discrete GPU and where +// maximum image quality is desired. +PL_API extern const struct pl_render_params pl_render_high_quality_params; + +#define PL_MAX_PLANES 4 + +// High level description of a single slice of an image. This basically +// represents a single 2D plane, with any number of components +struct pl_plane { + // The texture underlying this plane. 
The texture must be 2D, and must + // have specific parameters set depending on what the plane is being used + // for (see `pl_render_image`). + pl_tex texture; + + // The preferred behaviour when sampling outside of this texture. Optional, + // since the default (PL_TEX_ADDRESS_CLAMP) is very reasonable. + enum pl_tex_address_mode address_mode; + + // Controls whether or not the `texture` will be considered flipped + // vertically with respect to the overall image dimensions. It's generally + // preferable to flip planes using this setting instead of the crop in + // cases where the flipping is the result of e.g. negative plane strides or + // flipped framebuffers (OpenGL). + // + // Note that any planar padding (due to e.g. size mismatch or misalignment + // of subsampled planes) is always at the physical end of the texture + // (highest y coordinate) - even if this bool is true. However, any + // subsampling shift (`shift_y`) is applied with respect to the flipped + // direction. This ensures the correct interpretation when e.g. vertically + // flipping 4:2:0 sources by flipping all planes. + bool flipped; + + // Describes the number and interpretation of the components in this plane. + // This defines the mapping from component index to the canonical component + // order (RGBA, YCbCrA or XYZA). It's worth pointing out that this is + // completely separate from `texture->format.sample_order`. The latter is + // essentially irrelevant/transparent for the API user, since it just + // determines which order the texture data shows up as inside the GLSL + // shader; whereas this field controls the actual meaning of the component. + // + // Example; if the user has a plane with just {Y} and a plane with just + // {Cb Cr}, and a GPU that only supports bgra formats, you would still + // specify the component mapping as {0} and {1 2} respectively, even though + // the GPU is sampling the data in the order BGRA. Use -1 for "ignored" + // components. + int components; // number of relevant components + int component_mapping[4]; // semantic index of each component + + // Controls the sample offset, relative to the "reference" dimensions. For + // an example of what to set here, see `pl_chroma_location_offset`. Note + // that this is given in unit of reference pixels. For a graphical example, + // imagine you have a 2x2 image with a 1x1 (subsampled) plane. Without any + // shift (0.0), the situation looks like this: + // + // X-------X X = reference pixel + // | | P = plane pixel + // | P | + // | | + // X-------X + // + // For 4:2:0 subsampling, this corresponds to PL_CHROMA_CENTER. If the + // shift_x was instead set to -0.5, the `P` pixel would be offset to the + // left by half the separation between the reference (`X` pixels), resulting + // in the following: + // + // X-------X X = reference pixel + // | | P = plane pixel + // P | + // | | + // X-------X + // + // For 4:2:0 subsampling, this corresponds to PL_CHROMA_LEFT. + // + // Note: It's recommended to fill this using `pl_chroma_location_offset` on + // the chroma planes. 
+ float shift_x, shift_y; +}; + +enum pl_overlay_mode { + PL_OVERLAY_NORMAL = 0, // treat the texture as a normal, full-color texture + PL_OVERLAY_MONOCHROME, // treat the texture as a single-component alpha map + PL_OVERLAY_MODE_COUNT, +}; + +enum pl_overlay_coords { + PL_OVERLAY_COORDS_AUTO = 0, // equal to SRC/DST_FRAME, respectively + PL_OVERLAY_COORDS_SRC_FRAME, // relative to the raw src frame + PL_OVERLAY_COORDS_SRC_CROP, // relative to the src frame crop + PL_OVERLAY_COORDS_DST_FRAME, // relative to the raw dst frame + PL_OVERLAY_COORDS_DST_CROP, // relative to the dst frame crop + PL_OVERLAY_COORDS_COUNT, + + // Note on rotations: If there is an end-to-end rotation between `src` and + // `dst`, then any overlays relative to SRC_FRAME or SRC_CROP will be + // rotated alongside the image, while overlays relative to DST_FRAME or + // DST_CROP will not. +}; + +struct pl_overlay_part { + pl_rect2df src; // source coordinate with respect to `pl_overlay.tex` + pl_rect2df dst; // target coordinates with respect to `pl_overlay.coords` + + // If `mode` is PL_OVERLAY_MONOCHROME, then this specifies the color of + // this overlay part. The color is multiplied into the sampled texture's + // first channel. + float color[4]; +}; + +// A struct representing an image overlay (e.g. for subtitles or on-screen +// status messages, controls, ...) +struct pl_overlay { + // The texture containing the backing data for overlay parts. Must have + // `params.sampleable` set. + pl_tex tex; + + // This controls the coloring mode of this overlay. + enum pl_overlay_mode mode; + + // Controls which coordinates this overlay is addressed relative to. + enum pl_overlay_coords coords; + + // This controls the colorspace information for this overlay. The contents + // of the texture / the value of `color` are interpreted according to this. + struct pl_color_repr repr; + struct pl_color_space color; + + // The number of parts for this overlay. + const struct pl_overlay_part *parts; + int num_parts; +}; + +// High-level description of a complete frame, including metadata and planes +struct pl_frame { + // Each frame is split up into some number of planes, each of which may + // carry several components and be of any size / offset. + int num_planes; + struct pl_plane planes[PL_MAX_PLANES]; + + // For interlaced frames. If set, this `pl_frame` corresponds to a single + // field of the underlying source textures. `first_field` indicates which + // of these fields is ordered first in time. `prev` and `next` should point + // to the previous/next frames in the file, or NULL if there are none. + // + // Note: Setting these fields on the render target has no meaning and will + // be ignored. + enum pl_field field; + enum pl_field first_field; + const struct pl_frame *prev, *next; + + // If set, will be called immediately before GPU access to this frame. This + // function *may* be used to, for example, perform synchronization with + // external APIs (e.g. `pl_vulkan_hold/release`). If your mapping requires + // a memcpy of some sort (e.g. pl_tex_transfer), users *should* instead do + // the memcpy up-front and avoid the use of these callbacks - because they + // might be called multiple times on the same frame. + // + // This function *may* arbitrarily mutate the `pl_frame`, but it *should* + // ideally only update `planes` - in particular, color metadata and so + // forth should be provided up-front as best as possible. 
Note that changes + // here will not be reflected back to the structs provided in the original + // `pl_render_*` call (e.g. via `pl_frame_mix`). + // + // Note: Unless dealing with interlaced frames, only one frame will ever be + // acquired at a time per `pl_render_*` call. So users *can* safely use + // this with, for example, hwdec mappers that can only map a single frame + // at a time. When using this with, for example, `pl_render_image_mix`, + // each frame to be blended is acquired and released in succession, before + // moving on to the next frame. For interlaced frames, the previous and + // next frames must also be acquired simultaneously. + bool (*acquire)(pl_gpu gpu, struct pl_frame *frame); + + // If set, will be called after a plane is done being used by the GPU, + // *including* after any errors (e.g. `acquire` returning false). + void (*release)(pl_gpu gpu, struct pl_frame *frame); + + // Color representation / encoding / semantics of this frame. + struct pl_color_repr repr; + struct pl_color_space color; + + // Optional ICC profile associated with this frame. + pl_icc_object icc; + + // Alternative to `icc`, this can be used in cases where allocating and + // tracking a pl_icc_object externally may be inconvenient. The resulting + // profile will be managed internally by the pl_renderer. + struct pl_icc_profile profile; + + // Optional LUT associated with this frame. + const struct pl_custom_lut *lut; + enum pl_lut_type lut_type; + + // The logical crop / rectangle containing the valid information, relative + // to the reference plane's dimensions (e.g. luma). Pixels outside of this + // rectangle will ostensibly be ignored, but note that this is not a hard + // guarantee. In particular, scaler filters may end up sampling outside of + // this crop. This rect may be flipped, and may be partially or wholly + // outside the bounds of the underlying textures. (Optional) + // + // Note that `pl_render_image` will map the input crop directly to the + // output crop, stretching and scaling as needed. If you wish to preserve + // the aspect ratio, use a dedicated function like pl_rect2df_aspect_copy. + pl_rect2df crop; + + // Logical rotation of the image, with respect to the underlying planes. + // For example, if this is PL_ROTATION_90, then the image will be rotated + // to the right by 90° when mapping to `crop`. The actual position on-screen + // is unaffected, so users should ensure that the (rotated) aspect ratio + // matches the source. (Or use a helper like `pl_rect2df_aspect_set_rot`) + // + // Note: For `target` frames, this corresponds to a rotation of the + // display, for `image` frames, this corresponds to a rotation of the + // camera. + // + // So, as an example, target->rotation = PL_ROTATION_90 means the end user + // has rotated the display to the right by 90° (meaning rendering will be + // rotated 90° to the *left* to compensate), and image->rotation = + // PL_ROTATION_90 means the video provider has rotated the camera to the + // right by 90° (so rendering will be rotated 90° to the *right* to + // compensate). + pl_rotation rotation; + + // A list of additional overlays associated with this frame. Note that these will + // be rendered directly onto intermediate/cache frames, so changing any of + // these overlays may require flushing the renderer cache.
+ const struct pl_overlay *overlays; + int num_overlays; + + // Note on subsampling and plane correspondence: All planes belonging to + // the same frame will only be stretched by an integer multiple (or inverse + // thereof) in order to match the reference dimensions of this image. For + // example, suppose you have an 8x4 image. A valid plane scaling would be + // 4x2 -> 8x4 or 4x4 -> 4x4, but not 6x4 -> 8x4. So if a 6x4 plane is + // given, then it would be treated like a cropped 8x4 plane (since 1.0 is + // the closest scaling ratio to the actual ratio of 1.3). + // + // For an explanation of why this makes sense, consider the relatively + // common example of a subsampled, oddly sized (e.g. jpeg) image. In such + // cases, for example a 35x23 image, the 4:2:0 subsampled chroma plane + // would have to end up as 17.5x11.5, which gets rounded up to 18x12 by + // implementations. So in this example, the 18x12 chroma plane would get + // treated by libplacebo as an oversized chroma plane - i.e. the plane + // would get sampled as if it was 17.5 pixels wide and 11.5 pixels large. + + // Associated film grain data (see <libplacebo/shaders/film_grain.h>). + // + // Note: This is ignored for the `target` of `pl_render_image`, since + // un-applying grain makes little sense. + struct pl_film_grain_data film_grain; + + // Ignored by libplacebo. May be useful for users. + void *user_data; +}; + +// Helper function to infer the chroma location offset for each plane in a +// frame. This is equivalent to calling `pl_chroma_location_offset` on all +// subsampled planes' shift_x/shift_y variables. +PL_API void pl_frame_set_chroma_location(struct pl_frame *frame, + enum pl_chroma_location chroma_loc); + +// Fills in a `pl_frame` based on a swapchain frame's FBO and metadata. +PL_API void pl_frame_from_swapchain(struct pl_frame *out_frame, + const struct pl_swapchain_frame *frame); + +// Helper function to determine if a frame is logically cropped or not. In +// particular, this is useful in determining whether or not an output frame +// needs to be cleared before rendering or not. +PL_API bool pl_frame_is_cropped(const struct pl_frame *frame); + +// Helper function to reset a frame to a given RGB color. If the frame's +// color representation is something other than RGB, the clear color will +// be adjusted accordingly. `clear_color` should be non-premultiplied. +PL_API void pl_frame_clear_rgba(pl_gpu gpu, const struct pl_frame *frame, + const float clear_color[4]); + +// Like `pl_frame_clear_rgba` but without an alpha channel. +static inline void pl_frame_clear(pl_gpu gpu, const struct pl_frame *frame, + const float clear_color[3]) +{ + const float clear_color_rgba[4] = { clear_color[0], clear_color[1], clear_color[2], 1.0 }; + pl_frame_clear_rgba(gpu, frame, clear_color_rgba); +} + +// Helper functions to return the fixed/inferred pl_frame parameters used +// for rendering internally. Mutates `image` and `target` in-place to hold +// the modified values, which are what will actually be used for rendering. +// +// This currently includes: +// - Defaulting all missing pl_color_space/repr parameters +// - Coalescing all rotation to the target +// - Rounding and clamping the target crop to pixel boundaries and adjusting the +// image crop correspondingly +// +// Note: This is idempotent and does not generally alter the effects of a +// subsequent `pl_render_image` on the same pl_frame pair. 
(But see the +// following warning) +// +// Warning: This does *not* call pl_frame.acquire/release, and so the returned +// metadata *may* be incorrect if the acquire callback mutates the pl_frame in +// nontrivial ways, in particular the crop and color space fields. +PL_API void pl_frames_infer(pl_renderer rr, struct pl_frame *image, + struct pl_frame *target); + + +// Render a single image to a target using the given parameters. This is +// fully dynamic, i.e. the params can change at any time. libplacebo will +// internally detect and flush whatever caches are invalidated as a result of +// changing colorspace, size etc. +// +// Required plane capabilities: +// - Planes in `image` must be `sampleable` +// - Planes in `target` must be `renderable` +// +// Recommended plane capabilities: (Optional, but good for performance) +// - Planes in `image` should have `sample_mode` PL_TEX_SAMPLE_LINEAR +// - Planes in `target` should be `storable` +// - Planes in `target` should have `blit_dst` +// +// Note on lifetime: Once this call returns, the passed structures may be +// freely overwritten or discarded by the caller, even the referenced +// `pl_tex` objects may be freely reused. +// +// Note: `image` may be NULL, in which case `target.overlays` will still be +// rendered, but nothing else. +PL_API bool pl_render_image(pl_renderer rr, const struct pl_frame *image, + const struct pl_frame *target, + const struct pl_render_params *params); + +// Flushes the internal state of this renderer. This is normally not needed, +// even if the image parameters, colorspace or target configuration change, +// since libplacebo will internally detect such circumstances and recreate +// outdated resources automatically. Doing this explicitly *may* be useful to +// purge some state related to things like HDR peak detection or frame mixing, +// so calling it is a good idea if the content source is expected to change +// dramatically (e.g. when switching to a different file). +PL_API void pl_renderer_flush_cache(pl_renderer rr); + +// Mirrors `pl_get_detected_hdr_metadata`, giving you the current internal peak +// detection HDR metadata (when peak detection is active). Returns false if no +// information is available (e.g. not HDR source, peak detection disabled). +PL_API bool pl_renderer_get_hdr_metadata(pl_renderer rr, + struct pl_hdr_metadata *metadata); + +// Represents a mixture of input frames, distributed temporally. +// +// NOTE: Frames must be sorted by timestamp, i.e. `timestamps` must be +// monotonically increasing. +struct pl_frame_mix { + // The number of frames in this mixture. The number of frames should be + // sufficient to meet the needs of the configured frame mixer. See the + // section below for more information. + // + // If the number of frames is 0, this call will be equivalent to + // `pl_render_image` with `image == NULL`. + int num_frames; + + // A list of the frames themselves. The frames can have different + // colorspaces, configurations of planes, or even sizes. + // + // Note: This is a list of pointers, to avoid users having to copy + // around `pl_frame` structs when re-organizing this array. + const struct pl_frame **frames; + + // A list of unique signatures, one for each frame. These are used to + // identify frames across calls to this function, so it's crucial that they + // be both unique per-frame but also stable across invocations of + // `pl_render_frame_mix`. + const uint64_t *signatures; + + // A list of relative timestamps for each frame. 
These are relative to the + // time of the vsync being drawn, i.e. this function will render the frame + // that will be made visible at timestamp 0.0. The values are expected to + // be normalized such that a separation of 1.0 corresponds to roughly one + // nominal source frame duration. So a constant framerate video file will + // always have timestamps like e.g. {-2.3, -1.3, -0.3, 0.7, 1.7, 2.7}, + // using an example radius of 3. + // + // In cases where the framerate is variable (e.g. VFR video), the choice of + // what to scale to use can be difficult to answer. A typical choice would + // be either to use the canonical (container-tagged) framerate, or the + // highest momentary framerate, as a reference. If all else fails, you + // could also use the display's framerate. + // + // Note: This function assumes zero-order-hold semantics, i.e. the frame at + // timestamp 0.7 is intended to remain visible until timestamp 1.7, when + // the next frame replaces it. + const float *timestamps; + + // The duration for which the vsync being drawn will be held, using the + // same scale as `timestamps`. If the display has an unknown or variable + // frame-rate (e.g. Adaptive Sync), then you're probably better off not + // using this function and instead just painting the frames directly using + // `pl_render_frame` at the correct PTS. + // + // As an example, if `vsync_duration` is 0.4, then it's assumed that the + // vsync being painted is visible for the period [0.0, 0.4]. + float vsync_duration; + + // Explanation of the frame mixing radius: The algorithm chosen in + // `pl_render_params.frame_mixer` has a canonical radius equal to + // `pl_filter_config.kernel->radius`. This means that the frame mixing + // algorithm will (only) need to consult all of the frames that have a + // distance within the interval [-radius, radius]. As such, the user should + // include all such frames in `frames`, but may prune or omit frames that + // lie outside it. + // + // The built-in frame mixing (`pl_render_params.frame_mixer == NULL`) has + // no concept of radius, it just always needs access to the "current" and + // "next" frames. +}; + +// Helper function to calculate the base frame mixing radius. +// +// Note: When the source FPS exceeds the display FPS, this radius must be +// increased by the corresponding ratio. +static inline float pl_frame_mix_radius(const struct pl_render_params *params) +{ + // For backwards compatibility, allow !frame_mixer->kernel + if (!params->frame_mixer || !params->frame_mixer->kernel) + return 0.0; + + return params->frame_mixer->kernel->radius; +} + +// Find closest frame to current PTS by zero-order hold semantics, or NULL. +PL_API const struct pl_frame *pl_frame_mix_current(const struct pl_frame_mix *mix); + +// Find closest frame to current PTS by nearest neighbour semantics, or NULL. +PL_API const struct pl_frame *pl_frame_mix_nearest(const struct pl_frame_mix *mix); + +// Render a mixture of images to the target using the given parameters. This +// functions much like a generalization of `pl_render_image`, for when the API +// user has more control over the frame queue / vsync loop, and can provide a +// few frames from the past and future + timestamp information. +// +// This allows libplacebo to perform rudimentary frame mixing / interpolation, +// in order to eliminate judder artifacts typically associated with +// source/display frame rate mismatch. 
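The following sketch shows how the fields of `pl_frame_mix` documented above fit together in a per-vsync draw call. Filling the arrays is left to a real frame queue (such as the `pl_queue` mentioned earlier); `gather_frames()` is a hypothetical placeholder for that bookkeeping, and the `vsync_duration` of 1.0 assumes the display rate matches the nominal source rate.

```c
#include <stdbool.h>
#include <stdint.h>
#include <libplacebo/renderer.h>

bool draw_vsync(pl_renderer rr, const struct pl_frame *target,
                const struct pl_render_params *params)
{
    const struct pl_frame *frames[16];
    uint64_t signatures[16];
    float timestamps[16];

    // Collect all frames within pl_frame_mix_radius(params) of the vsync,
    // sorted by timestamp, with 0.0 denoting the vsync being drawn.
    int num = 0; // = gather_frames(frames, signatures, timestamps);

    struct pl_frame_mix mix = {
        .num_frames     = num,
        .frames         = frames,
        .signatures     = signatures,
        .timestamps     = timestamps,
        .vsync_duration = 1.0f,
    };

    return pl_render_image_mix(rr, &mix, target, params);
}
```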
+PL_API bool pl_render_image_mix(pl_renderer rr, const struct pl_frame_mix *images, + const struct pl_frame *target, + const struct pl_render_params *params); + +// Analog of `pl_frame_infer` corresponding to `pl_render_image_mix`. This +// function will *not* mutate the frames contained in `mix`, and instead +// return an adjusted copy of the "reference" frame for that image mix in +// `out_refimage`, or {0} if the mix is empty. +PL_API void pl_frames_infer_mix(pl_renderer rr, const struct pl_frame_mix *mix, + struct pl_frame *target, struct pl_frame *out_ref); + +// Backwards compatibility with old filters API, may be deprecated. +// Redundant with pl_filter_configs and masking `allowed` for +// PL_FILTER_SCALING and PL_FILTER_FRAME_MIXING respectively. + +// A list of recommended frame mixer presets, terminated by {0} +PL_API extern const struct pl_filter_preset pl_frame_mixers[]; +PL_API extern const int pl_num_frame_mixers; // excluding trailing {0} + +// A list of recommended scaler presets, terminated by {0}. This is almost +// equivalent to `pl_filter_presets` with the exception of including extra +// built-in filters that don't map to the `pl_filter` architecture. +PL_API extern const struct pl_filter_preset pl_scale_filters[]; +PL_API extern const int pl_num_scale_filters; // excluding trailing {0} + +// Deprecated in favor of `pl_cache_save/pl_cache_load` on the `pl_cache` +// associated with the `pl_gpu` this renderer is using. +PL_DEPRECATED PL_API size_t pl_renderer_save(pl_renderer rr, uint8_t *out_cache); +PL_DEPRECATED PL_API void pl_renderer_load(pl_renderer rr, const uint8_t *cache); + +PL_API_END + +#endif // LIBPLACEBO_RENDERER_H_ diff --git a/src/include/libplacebo/shaders.h b/src/include/libplacebo/shaders.h new file mode 100644 index 0000000..b8046be --- /dev/null +++ b/src/include/libplacebo/shaders.h @@ -0,0 +1,273 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_SHADERS_H_ +#define LIBPLACEBO_SHADERS_H_ + +// This function defines the "direct" interface to libplacebo's GLSL shaders, +// suitable for use in contexts where the user controls GLSL shader compilation +// but wishes to include functions generated by libplacebo as part of their +// own rendering process. This API is normally not used for operation with +// libplacebo's higher-level constructs such as `pl_dispatch` or `pl_renderer`. + +#include <libplacebo/gpu.h> + +PL_API_BEGIN + +// Thread-safety: Unsafe +typedef struct pl_shader_t *pl_shader; + +struct pl_shader_params { + // The `id` represents an abstract identifier for the shader, to avoid + // collisions with other shaders being used as part of the same larger, + // overarching shader. This is relevant for users which want to combine + // multiple `pl_shader` objects together, in which case all `pl_shader` + // objects should have a unique `id`. 
+ uint8_t id; + + // If `gpu` is non-NULL, then this `gpu` will be used to create objects + // such as textures and buffers, or check for required capabilities, for + // operations which depend on either of those. This is fully optional, i.e. + // these GLSL primitives are designed to be used without a dependency on + // `gpu` wherever possible - however, some features may not work, and will + // be disabled even if requested. + pl_gpu gpu; + + // The `index` represents an abstract frame index, which shaders may use + // internally to do things like temporal dithering or seeding PRNGs. If the + // user does not care about temporal dithering/debanding, or wants + // deterministic rendering, this may safely be left as 0. Otherwise, it + // should be incremented by 1 on successive frames. + uint8_t index; + + // If `glsl.version` is nonzero, then this structure will be used to + // determine the effective GLSL mode and capabilities. If `gpu` is also + // set, then this overrides `gpu->glsl`. + struct pl_glsl_version glsl; + + // If this is true, all constants in the shader will be replaced by + // dynamic variables. This is mainly useful to avoid recompilation for + // shaders which expect to have their values change constantly. + bool dynamic_constants; +}; + +#define pl_shader_params(...) (&(struct pl_shader_params) { __VA_ARGS__ }) + +// Creates a new, blank, mutable pl_shader object. +// +// Note: Rather than allocating and destroying many shaders, users are +// encouraged to reuse them (using `pl_shader_reset`) for efficiency. +PL_API pl_shader pl_shader_alloc(pl_log log, const struct pl_shader_params *params); + +// Frees a pl_shader and all resources associated with it. +PL_API void pl_shader_free(pl_shader *sh); + +// Resets a pl_shader to a blank slate, without releasing internal memory. +// If you're going to be re-generating shaders often, this function will let +// you skip the re-allocation overhead. +PL_API void pl_shader_reset(pl_shader sh, const struct pl_shader_params *params); + +// Returns whether or not a shader is in a "failed" state. Trying to modify a +// shader in illegal ways (e.g. signature mismatch) will result in the shader +// being marked as "failed". Since most pl_shader_ operations have a void +// return type, the user can use this function to figure out whether a specific +// shader operation has failed or not. This function is somewhat redundant +// since `pl_shader_finalize` will also return NULL in this case. +PL_API bool pl_shader_is_failed(const pl_shader sh); + +// Returns whether or not a pl_shader needs to be run as a compute shader. This +// will never be the case unless the `pl_glsl_version` this `pl_shader` was +// created using has `compute` support enabled. +PL_API bool pl_shader_is_compute(const pl_shader sh); + +// Returns whether or not the shader has any particular output size +// requirements. Some shaders, in particular those that sample from other +// textures, have specific output size requirements which need to be respected +// by the caller. If this is false, then the shader is compatible with every +// output size. If true, the size requirements are stored into *w and *h. +PL_API bool pl_shader_output_size(const pl_shader sh, int *w, int *h); + +// Indicates the type of signature that is associated with a shader result. +// Every shader result defines a function that may be called by the user, and +// this enum indicates the type of value that this function takes and/or +// returns. 
+// +// Which signature a shader ends up with depends on the type of operation being +// performed by a shader fragment, as determined by the user's calls. See below +// for more information. +enum pl_shader_sig { + PL_SHADER_SIG_NONE = 0, // no input / void output + PL_SHADER_SIG_COLOR, // vec4 color (normalized so that 1.0 is the ref white) + + // The following are only valid as input signatures: + PL_SHADER_SIG_SAMPLER, // (gsampler* src_tex, vecN tex_coord) pair, + // specifics depend on how the shader was generated +}; + +// Structure encapsulating information about a shader. This is internally +// refcounted, to allow moving it around without having to create deep copies. +typedef const struct pl_shader_info_t { + // A copy of the parameters used to create the shader. + struct pl_shader_params params; + + // A list of friendly names for the semantic operations being performed by + // this shader, e.g. "color decoding" or "debanding". + const char **steps; + int num_steps; + + // As a convenience, this contains a pretty-printed version of the + // above list, with entries tallied and separated by commas + const char *description; +} *pl_shader_info; + +PL_API pl_shader_info pl_shader_info_ref(pl_shader_info info); +PL_API void pl_shader_info_deref(pl_shader_info *info); + +// Represents a finalized shader fragment. This is not a complete shader, but a +// collection of raw shader text together with description of the input +// attributes, variables and vertices it expects to be available. +struct pl_shader_res { + // Descriptive information about the shader. Note that this reference is + // attached to the shader itself - the user does not need to manually ref + // or deref `info` unless they wish to move it elsewhere. + pl_shader_info info; + + // The shader text, as literal GLSL. This will always be a function + // definition, such that the function with the indicated name and + // signature may be called by the user. + const char *glsl; + const char *name; + enum pl_shader_sig input; // what the function expects + enum pl_shader_sig output; // what the function returns + + // For compute shaders (pl_shader_is_compute), this indicates the requested + // work group size. Otherwise, both fields are 0. The interpretation of + // these work groups is that they're tiled across the output image. + int compute_group_size[2]; + + // If this pass is a compute shader, this field indicates the shared memory + // size requirements for this shader pass. + size_t compute_shmem; + + // A set of input vertex attributes needed by this shader fragment. + const struct pl_shader_va *vertex_attribs; + int num_vertex_attribs; + + // A set of input variables needed by this shader fragment. + const struct pl_shader_var *variables; + int num_variables; + + // A list of input descriptors needed by this shader fragment. + const struct pl_shader_desc *descriptors; + int num_descriptors; + + // A list of compile-time constants used by this shader fragment. + const struct pl_shader_const *constants; + int num_constants; + + // --- Deprecated fields (see `info`) + struct pl_shader_params params PL_DEPRECATED; + const char **steps PL_DEPRECATED; + int num_steps PL_DEPRECATED; + const char *description PL_DEPRECATED; +}; + +// Represents a vertex attribute.
The four values will be bound to the four +// corner vertices respectively, in row-wise order starting from the top left: +// data[0] data[1] +// data[2] data[3] +struct pl_shader_va { + struct pl_vertex_attrib attr; // VA type, excluding `offset` and `location` + const void *data[4]; +}; + +// Represents a bound shared variable / descriptor +struct pl_shader_var { + struct pl_var var; // the underlying variable description + const void *data; // the raw data (as per `pl_var_host_layout`) + bool dynamic; // if true, the value is expected to change frequently +}; + +struct pl_buffer_var { + struct pl_var var; + struct pl_var_layout layout; +}; + +typedef uint16_t pl_memory_qualifiers; +enum { + PL_MEMORY_COHERENT = 1 << 0, // supports synchronization across shader invocations + PL_MEMORY_VOLATILE = 1 << 1, // all writes are synchronized automatically + + // Note: All descriptors are also implicitly assumed to have the 'restrict' + // memory qualifier. There is currently no way to override this behavior. +}; + +struct pl_shader_desc { + struct pl_desc desc; // descriptor type, excluding `int binding` + struct pl_desc_binding binding; // contents of the descriptor binding + + // For PL_DESC_BUF_UNIFORM/STORAGE, this specifies the layout of the + // variables contained by a buffer. Ignored for the other descriptor types + struct pl_buffer_var *buffer_vars; + int num_buffer_vars; + + // For storage images and buffers, this specifies additional memory + // qualifiers on the descriptor. It's highly recommended to always use + // at least PL_MEMORY_RESTRICT. Ignored for other descriptor types. + pl_memory_qualifiers memory; +}; + +// Represents a compile-time constant. This can be lowered to a specialization +// constant to support cheaper recompilations. +struct pl_shader_const { + enum pl_var_type type; + const char *name; + const void *data; + + // If true, this constant *must* be a compile-time constant, which + // basically just overrides `pl_shader_params.dynamic_constants`. Useful + // for constants which will serve as inputs to e.g. array sizes. + bool compile_time; +}; + +// Finalize a pl_shader. It is no longer mutable at this point, and any further +// attempts to modify it result in an error. (Functions which take a `const +// pl_shader` argument do not modify the shader and may be freely +// called on an already-finalized shader) +// +// The returned pl_shader_res is bound to the lifetime of the pl_shader - and +// will only remain valid until the pl_shader is freed or reset. This function +// may be called multiple times, and will produce the same result each time. +// +// This function will return NULL if the shader is considered to be in a +// "failed" state (see pl_shader_is_failed). +PL_API const struct pl_shader_res *pl_shader_finalize(pl_shader sh); + +// Shader objects represent abstract resources that shaders need to manage in +// order to ensure their operation. This could include shader storage buffers, +// generated lookup textures, or other sorts of configured state. The body +// of a shader object is fully opaque; but the user is in charge of cleaning up +// after them and passing them to the right shader passes. +// +// Note: pl_shader_obj objects must be initialized to NULL by the caller. 
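+//
+// Example (illustrative sketch only): the intended lifecycle is to declare the
+// object as NULL, pass its address to whichever shader function needs it
+// (here, a hypothetical per-frame dithering call), and destroy it once it is
+// no longer needed:
+//
+//   pl_shader_obj dither_state = NULL;
+//   // ... every frame: pl_shader_dither(sh, 8, &dither_state, NULL); ...
+//   pl_shader_obj_destroy(&dither_state);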
+typedef struct pl_shader_obj_t *pl_shader_obj; + +PL_API void pl_shader_obj_destroy(pl_shader_obj *obj); + +PL_API_END + +#endif // LIBPLACEBO_SHADERS_H_ diff --git a/src/include/libplacebo/shaders/colorspace.h b/src/include/libplacebo/shaders/colorspace.h new file mode 100644 index 0000000..ead0958 --- /dev/null +++ b/src/include/libplacebo/shaders/colorspace.h @@ -0,0 +1,381 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_SHADERS_COLORSPACE_H_ +#define LIBPLACEBO_SHADERS_COLORSPACE_H_ + +// Color space transformation shaders. These all input and output a color +// value (PL_SHADER_SIG_COLOR). + +#include <libplacebo/colorspace.h> +#include <libplacebo/gamut_mapping.h> +#include <libplacebo/tone_mapping.h> +#include <libplacebo/shaders.h> + +// For backwards compatibility +#include <libplacebo/shaders/dithering.h> + +PL_API_BEGIN + +// Transform the input color, in its given representation, to ensure +// compatibility with the indicated alpha mode. Mutates `repr` to reflect the +// change. Note that this is a no-op if the input is PL_ALPHA_UNKNOWN. +PL_API void pl_shader_set_alpha(pl_shader sh, struct pl_color_repr *repr, + enum pl_alpha_mode mode); + +// Colorspace reshaping for PL_COLOR_SYSTEM_DOLBYVISION. Note that this is done +// automatically by `pl_shader_decode_color` for PL_COLOR_SYSTEM_DOLBYVISION. +PL_API void pl_shader_dovi_reshape(pl_shader sh, const struct pl_dovi_metadata *data); + +// Decode the color into normalized RGB, given a specified color_repr. This +// also takes care of additional pre- and post-conversions required for the +// "special" color systems (XYZ, BT.2020-C, etc.). If `params` is left as NULL, +// it defaults to &pl_color_adjustment_neutral. +// +// Note: This function always returns PC-range RGB with independent alpha. +// It mutates the pl_color_repr to reflect the change. +// +// Note: For DCDM XYZ decoding output is linear +PL_API void pl_shader_decode_color(pl_shader sh, struct pl_color_repr *repr, + const struct pl_color_adjustment *params); + +// Encodes a color from normalized, PC-range, independent alpha RGB into a +// given representation. That is, this performs the inverse operation of +// `pl_shader_decode_color` (sans color adjustments). +// +// Note: For DCDM XYZ encoding input is expected to be linear +PL_API void pl_shader_encode_color(pl_shader sh, const struct pl_color_repr *repr); + +// Linearize (expand) `vec4 color`, given a specified color space. In essence, +// this corresponds to the ITU-R EOTF. +// +// Note: Unlike the ITU-R EOTF, it never includes the OOTF - even for systems +// where the EOTF includes the OOTF (such as HLG). +PL_API void pl_shader_linearize(pl_shader sh, const struct pl_color_space *csp); + +// Delinearize (compress), given a color space as output.
This loosely +// corresponds to the inverse EOTF (not the OETF) in ITU-R terminology, again +// assuming a reference monitor. +PL_API void pl_shader_delinearize(pl_shader sh, const struct pl_color_space *csp); + +struct pl_sigmoid_params { + // The center (bias) of the sigmoid curve. Must be between 0.0 and 1.0. + // If left as 0.0, defaults to 0.75 + float center; + + // The slope (steepness) of the sigmoid curve. Must be between 1.0 and 20.0. + // If left as 0.0, defaults to 6.5. + float slope; +}; + +#define PL_SIGMOID_DEFAULTS \ + .center = 0.75, \ + .slope = 6.50, + +#define pl_sigmoid_params(...) (&(struct pl_sigmoid_params) { PL_SIGMOID_DEFAULTS __VA_ARGS__ }) +PL_API extern const struct pl_sigmoid_params pl_sigmoid_default_params; + +// Applies a sigmoidal color transform to all channels. This helps avoid +// ringing artifacts during upscaling by bringing the color information closer +// to neutral and away from the extremes. If `params` is NULL, it defaults to +// &pl_sigmoid_default_params. +// +// Warning: This function clamps the input to the interval [0,1], and as such +// it should *NOT* be used on already-decoded high-dynamic range content. +PL_API void pl_shader_sigmoidize(pl_shader sh, const struct pl_sigmoid_params *params); + +// This performs the inverse operation to `pl_shader_sigmoidize`. +PL_API void pl_shader_unsigmoidize(pl_shader sh, const struct pl_sigmoid_params *params); + +struct pl_peak_detect_params { + // Smoothing coefficient for the detected values. This controls the time + // parameter (tau) of an IIR low pass filter. In other words, it represents + // the cutoff period (= 1 / cutoff frequency) in frames. Fluctuations + // shorter than this period will be suppressed. This helps block out annoying + // "sparkling" or "flickering" due to small variations in frame-to-frame + // brightness. If left as 0.0, this smoothing is completely disabled. + float smoothing_period; + + // In order to avoid reacting sluggishly to scene changes as a result of + // the low-pass filter, we disable it when the difference between the + // current frame brightness and the average frame brightness exceeds a + // given threshold difference. But rather than a single hard cutoff, which + // would lead to weird discontinuities on fades, we gradually disable it + // over a small window of brightness ranges. These parameters control the + // lower and upper bounds of this window, in units of 1% PQ. + // + // Setting either one of these to 0.0 disables this logic. + float scene_threshold_low; + float scene_threshold_high; + + // Which percentile of the input image brightness histogram to consider as + // the true peak of the scene. If this is set to 100 (or 0), the brightest + // pixel is measured. Otherwise, the top of the frequency distribution is + // progressively cut off. Setting this too low will cause clipping of very + // bright details, but can improve the dynamic brightness range of scenes + // with very bright isolated highlights. + // + // A recommended value is 99.995%, which is very conservative and should + // cause no major issues in typical content. + float percentile; + + // Allows the peak detection result to be delayed by up to a single frame, + // which can sometimes improve throughput, at the cost of introducing the + // possibility of 1-frame flickers on transitions. Disabled by default.
+ bool allow_delayed; + + // --- Deprecated / removed fields + float overshoot_margin PL_DEPRECATED; + float minimum_peak PL_DEPRECATED; +}; + +#define PL_PEAK_DETECT_DEFAULTS \ + .smoothing_period = 20.0f, \ + .scene_threshold_low = 1.0f, \ + .scene_threshold_high = 3.0f, \ + .percentile = 100.0f, + +#define PL_PEAK_DETECT_HQ_DEFAULTS \ + PL_PEAK_DETECT_DEFAULTS \ + .percentile = 99.995f, + +#define pl_peak_detect_params(...) (&(struct pl_peak_detect_params) { PL_PEAK_DETECT_DEFAULTS __VA_ARGS__ }) +PL_API extern const struct pl_peak_detect_params pl_peak_detect_default_params; +PL_API extern const struct pl_peak_detect_params pl_peak_detect_high_quality_params; + +// This function can be used to measure the CLL and FALL of a video +// source automatically, using a compute shader. The measured values are +// smoothed automatically (depending on the parameters), so to keep track of +// the measured results over time, a tone mapping shader state object is used +// to hold the state. Returns false on failure initializing the tone mapping +// object, or if compute shaders are not supported. +// +// It's important that the same shader object is used for successive frames +// belonging to the same source. If the source changes (e.g. due to a file +// change or seek), the user should reset it with `pl_reset_detected_peak` (or +// destroy it and use a new state object). +// +// The parameter `csp` holds the representation of the color values that are +// the input to this function. (They must already be in decoded RGB form, i.e. +// alternate color representations are not supported) +PL_API bool pl_shader_detect_peak(pl_shader sh, struct pl_color_space csp, + pl_shader_obj *state, + const struct pl_peak_detect_params *params); + +// After dispatching the above shader, this function can be used to retrieve +// the detected dynamic HDR10+ metadata parameters. The other fields of +// `metadata` are not written to. Returns whether or not any values were +// written. If not, the values are left untouched, so this can be used to +// safely update `pl_hdr_metadata` values in-place. This function may or may +// not block, depending on the previous setting of `allow_delayed`. +PL_API bool pl_get_detected_hdr_metadata(const pl_shader_obj state, + struct pl_hdr_metadata *metadata); + +// After dispatching the above shader, this function *may* be used to read out +// the detected CLL and FALL directly (in PL_HDR_NORM units). If the shader +// has never been dispatched yet, i.e. no information is available, this will +// return false. +// +// Deprecated in favor of `pl_get_detected_hdr_metadata` +PL_DEPRECATED PL_API bool pl_get_detected_peak(const pl_shader_obj state, + float *out_cll, float *out_fall); + +// Resets the peak detection state in a given tone mapping state object. This +// is not equal to `pl_shader_obj_destroy`, because it does not destroy any +// state used by `pl_shader_tone_map`. +PL_API void pl_reset_detected_peak(pl_shader_obj state); + +// Feature map extraction (for pl_color_map_args.feature_map). The result +// of this shader should be downscaled / low-passed to the indicated kernel +// size before use. (This does not happen automatically) +PL_API void pl_shader_extract_features(pl_shader sh, struct pl_color_space csp); + +// Deprecated and unused. Libplacebo now always performs a variant of the old +// hybrid tone-mapping, mixing together the intensity (I) and per-channel (LMS) +// results. 
+enum pl_tone_map_mode { + PL_TONE_MAP_AUTO PL_DEPRECATED_ENUMERATOR, + PL_TONE_MAP_RGB PL_DEPRECATED_ENUMERATOR, + PL_TONE_MAP_MAX PL_DEPRECATED_ENUMERATOR, + PL_TONE_MAP_HYBRID PL_DEPRECATED_ENUMERATOR, + PL_TONE_MAP_LUMA PL_DEPRECATED_ENUMERATOR, + PL_TONE_MAP_MODE_COUNT, +}; + +// Deprecated by <libplacebo/gamut_mapping.h> +enum pl_gamut_mode { + PL_GAMUT_CLIP PL_DEPRECATED_ENUMERATOR, // pl_gamut_map_clip + PL_GAMUT_WARN PL_DEPRECATED_ENUMERATOR, // pl_gamut_map_highlight + PL_GAMUT_DARKEN PL_DEPRECATED_ENUMERATOR, // pl_gamut_map_darken + PL_GAMUT_DESATURATE PL_DEPRECATED_ENUMERATOR, // pl_gamut_map_desaturate + PL_GAMUT_MODE_COUNT, +}; + +struct pl_color_map_params { + // --- Gamut mapping options + + // Gamut mapping function to use to handle out-of-gamut colors, including + // colors which are out-of-gamut as a consequence of tone mapping. + const struct pl_gamut_map_function *gamut_mapping; + + // Gamut mapping constants, for expert tuning. Leave as default otherwise. + struct pl_gamut_map_constants gamut_constants; + + // Gamut mapping 3DLUT size, for channels ICh. Defaults to {48, 32, 256} + int lut3d_size[3]; + + // Use higher quality, but slower, tricubic interpolation for gamut mapping + // 3DLUTs. May substantially improve the 3DLUT gamut mapping accuracy, in + // particular at smaller 3DLUT sizes. Shouldn't have much effect at the + // default size. + bool lut3d_tricubic; + + // If true, allows the gamut mapping function to expand the gamut, in + // cases where the target gamut exceeds that of the source. If false, + // the source gamut will never be enlarged, even when using a gamut + // mapping function capable of bidirectional mapping. + bool gamut_expansion; + + // --- Tone mapping options + + // Tone mapping function to use to handle out-of-range colors. + const struct pl_tone_map_function *tone_mapping_function; + + // Tone mapping constants, for expert tuning. Leave as default otherwise. + struct pl_tone_map_constants tone_constants; + + // If true, and supported by the given tone mapping function, libplacebo + // will perform inverse tone mapping to expand the dynamic range of a + // signal. libplacebo is not liable for any HDR-induced eye damage. + bool inverse_tone_mapping; + + // Data source to use when tone-mapping. Setting this to a specific + // value allows overriding the default metadata preference logic. + enum pl_hdr_metadata_type metadata; + + // Tone mapping LUT size. Defaults to 256. + int lut_size; + + // HDR contrast recovery strength. If set to a value above 0.0, the source + // image will be divided into high-frequency and low-frequency components, + // and a portion of the high-frequency image is added back onto the + // tone-mapped output. May cause excessive ringing artifacts for some HDR + // sources, but can improve the subjective sharpness and detail left over + // in the image after tone-mapping. + float contrast_recovery; + + // Contrast recovery lowpass kernel size. Defaults to 3.5. Increasing + // or decreasing this will affect the visual appearance substantially. + float contrast_smoothness; + + // --- Debugging options + + // Force the use of a full tone-mapping LUT even for functions that have + // faster pure GLSL replacements (e.g. clip, linear, saturation). + bool force_tone_mapping_lut; + + // Visualize the tone-mapping LUT and gamut mapping 3DLUT, in IPT space. + bool visualize_lut; + + // Controls where to draw the visualization, relative to the rendered + // video (dimensions 0-1). Optional, defaults to the full picture. 
+ pl_rect2df visualize_rect; + + // Controls the rotation of the 3DLUT visualization. + float visualize_hue; // useful range [-pi, pi] + float visualize_theta; // useful range [0, pi/2] + + // Graphically highlight hard-clipped pixels during tone-mapping (i.e. + // pixels that exceed the claimed source luminance range). + bool show_clipping; + + // --- Deprecated fields + enum pl_tone_map_mode tone_mapping_mode PL_DEPRECATED; // removed + float tone_mapping_param PL_DEPRECATED; // see `tone_constants` + float tone_mapping_crosstalk PL_DEPRECATED; // now hard-coded as 0.04 + enum pl_rendering_intent intent PL_DEPRECATED; // see `gamut_mapping` + enum pl_gamut_mode gamut_mode PL_DEPRECATED; // see `gamut_mapping` + float hybrid_mix PL_DEPRECATED; // removed +}; + +#define PL_COLOR_MAP_DEFAULTS \ + .gamut_mapping = &pl_gamut_map_perceptual, \ + .tone_mapping_function = &pl_tone_map_spline, \ + .gamut_constants = { PL_GAMUT_MAP_CONSTANTS }, \ + .tone_constants = { PL_TONE_MAP_CONSTANTS }, \ + .metadata = PL_HDR_METADATA_ANY, \ + .lut3d_size = {48, 32, 256}, \ + .lut_size = 256, \ + .visualize_rect = {0, 0, 1, 1}, \ + .contrast_smoothness = 3.5f, + +#define PL_COLOR_MAP_HQ_DEFAULTS \ + PL_COLOR_MAP_DEFAULTS \ + .contrast_recovery = 0.30f, + +#define pl_color_map_params(...) (&(struct pl_color_map_params) { PL_COLOR_MAP_DEFAULTS __VA_ARGS__ }) +PL_API extern const struct pl_color_map_params pl_color_map_default_params; +PL_API extern const struct pl_color_map_params pl_color_map_high_quality_params; + +// Execution arguments for the `pl_shader_color_map_ex` call. Distinct from +// `pl_color_map_params` because it is filled by internally-provided execution +// metadata, instead of user-tunable aesthetic parameters. +struct pl_color_map_args { + // Input/output color space for the mapping. + struct pl_color_space src; + struct pl_color_space dst; + + // If true, the logic will assume the input has already been linearized by + // the caller (e.g. as part of a previous linear light scaling operation). + bool prelinearized; + + // Object to be used to store generated LUTs. Note that this is the same + // state object used by `pl_shader_detect_peak`, and if that function has + // been called on `state` prior to `pl_shader_color_map`, the detected + // values will be used to guide the tone mapping algorithm. If this is not + // provided, tone/gamut mapping are disabled. + pl_shader_obj *state; + + // Low-resolution intensity feature map, as generated by + // `pl_shader_extract_features`. Optional. No effect if + // `params->contrast_recovery` is disabled. + pl_tex feature_map; +}; + +#define pl_color_map_args(...) (&(struct pl_color_map_args) { __VA_ARGS__ }) + +// Maps `vec4 color` from one color space to another color space according +// to the parameters (described in greater depth above). If `params` is left +// as NULL, it defaults to `&pl_color_map_default_params` +PL_API void pl_shader_color_map_ex(pl_shader sh, + const struct pl_color_map_params *params, + const struct pl_color_map_args *args); + +// Backwards compatibility wrapper around `pl_shader_color_map_ex` +PL_API void pl_shader_color_map(pl_shader sh, const struct pl_color_map_params *params, + struct pl_color_space src, struct pl_color_space dst, + pl_shader_obj *state, bool prelinearized); + +// Applies a set of cone distortion parameters to `vec4 color` in a given color +// space. This can be used to simulate color blindness. See `pl_cone_params` +// for more information. 
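+//
+// Example (illustrative sketch only): simulating deuteranopia on an sRGB
+// input, reusing the presets declared in <libplacebo/colorspace.h>:
+//
+//   pl_shader_cone_distort(sh, pl_color_space_srgb, &pl_vision_deuteranopia);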
+PL_API void pl_shader_cone_distort(pl_shader sh, struct pl_color_space csp, + const struct pl_cone_params *params); + +PL_API_END + +#endif // LIBPLACEBO_SHADERS_COLORSPACE_H_ diff --git a/src/include/libplacebo/shaders/custom.h b/src/include/libplacebo/shaders/custom.h new file mode 100644 index 0000000..a4eec69 --- /dev/null +++ b/src/include/libplacebo/shaders/custom.h @@ -0,0 +1,341 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_SHADERS_CUSTOM_H_ +#define LIBPLACEBO_SHADERS_CUSTOM_H_ + +#include <stdlib.h> + +// Functions for writing custom shaders and hooking them into the `pl_renderer` +// pipeline, as well as compatibility functions for parsing shaders in mpv +// format. + +#include <libplacebo/shaders.h> +#include <libplacebo/dispatch.h> +#include <libplacebo/colorspace.h> + +PL_API_BEGIN + +// Parameters describing custom shader text to be embedded into a `pl_shader` +// object. All of the strings are optional and can be left as NULL, but without +// a `body` in particular, the shader will do nothing useful on its own. +struct pl_custom_shader { + // The prelude contains text such as extra #defines, #extension pragmas, + // or other parts of the shader that must be placed at the very + // beginning (before input layout declarations etc.) + // + // Note: #extension pragmas do not need to be emitted to enable support for + // resource types already attached to the shader (e.g. SSBOs), compute + // shaders, or GPU capabilities known to libplacebo (e.g. subgroups). + const char *prelude; + + // The header contains text such as helper function definitions, extra + // uniforms, shared memory variables or buffer descriptions. + const char *header; + + // A friendly name for the shader. (Optional) + const char *description; + + // The "primary" GLSL code. This will be effectively appended to the "main" + // function. It lives in an environment given by the `input` signature, and + // is expected to return results in a way given by the `output` signature. + // + // Note: In the case of PL_SHADER_SIG_COLOR, the output `vec4 color` is + // allocated by `pl_shader_custom`, the user merely needs to assign to it. + // + // Note: For ease of development it can be useful to have the main logic + // live inside a helper function defined as part of `header`, and specify + // the `body` as a single line that simply calls the helper function. + const char *body; + enum pl_shader_sig input; + enum pl_shader_sig output; + + // Extra descriptors, variables and vertex attributes to attach to the + // resulting `pl_shader_res`. + // + // Note: The names inside these will possibly be replaced by fresh + // identifiers internally, so users should avoid looking for exact string + // matches for the given names inside the `pl_shader_res`. 
+ const struct pl_shader_desc *descriptors; + int num_descriptors; + const struct pl_shader_var *variables; + int num_variables; + const struct pl_shader_va *vertex_attribs; + int num_vertex_attribs; + const struct pl_shader_const *constants; + int num_constants; + + // If true, this shader must be a compute shader. The desired workgroup + // size and shared memory usage can be optionally specified, or 0 if no + // specific work group size or shared memory size restrictions apply. + // + // See also: `pl_shader_res.compute_group_size` + bool compute; + size_t compute_shmem; + int compute_group_size[2]; + + // Fixes the output size requirements of the shader to exact dimensions. + // Optional, if left as 0, means the shader can be dispatched at any size. + int output_w; + int output_h; +}; + +// Append custom shader code, including extra descriptors and variables, to an +// existing `pl_shader` object. Returns whether successful. This function may +// fail in the event that e.g. the custom shader requires compute shaders on +// an unsupported GPU, or exceeds the GPU's shared memory capabilities. +PL_API bool pl_shader_custom(pl_shader sh, const struct pl_custom_shader *params); + +// Which "rendering stages" are available for user shader hooking purposes. +// Except where otherwise noted, all stages are "non-resizable", i.e. the +// shaders already have specific output size requirements. +enum pl_hook_stage { + // Hook stages for the untouched planes, as made available by the source. + // These are all resizable, i.e. there are no specific output stage + // requirements. + PL_HOOK_RGB_INPUT = 1 << 0, + PL_HOOK_LUMA_INPUT = 1 << 1, + PL_HOOK_CHROMA_INPUT = 1 << 2, + PL_HOOK_ALPHA_INPUT = 1 << 3, + PL_HOOK_XYZ_INPUT = 1 << 4, + + // Hook stages for the scaled/aligned planes + PL_HOOK_CHROMA_SCALED = 1 << 5, + PL_HOOK_ALPHA_SCALED = 1 << 6, + + PL_HOOK_NATIVE = 1 << 7, // Combined image in its native color space + PL_HOOK_RGB = 1 << 8, // After conversion to RGB (resizable) + PL_HOOK_LINEAR = 1 << 9, // After linearization but before scaling + PL_HOOK_SIGMOID = 1 << 10, // After sigmoidization + PL_HOOK_PRE_KERNEL = 1 << 11, // Immediately before the main scaler kernel + PL_HOOK_POST_KERNEL = 1 << 12, // Immediately after the main scaler kernel + PL_HOOK_SCALED = 1 << 13, // After scaling, before color management + PL_HOOK_PRE_OUTPUT = 1 << 14, // After color management, before blending/rotation + PL_HOOK_OUTPUT = 1 << 15, // After blending/rotation, before dithering +}; + +// Returns true if a given hook stage is resizable +static inline bool pl_hook_stage_resizable(enum pl_hook_stage stage) { + switch (stage) { + case PL_HOOK_RGB_INPUT: + case PL_HOOK_LUMA_INPUT: + case PL_HOOK_CHROMA_INPUT: + case PL_HOOK_ALPHA_INPUT: + case PL_HOOK_XYZ_INPUT: + case PL_HOOK_NATIVE: + case PL_HOOK_RGB: + return true; + + case PL_HOOK_CHROMA_SCALED: + case PL_HOOK_ALPHA_SCALED: + case PL_HOOK_LINEAR: + case PL_HOOK_SIGMOID: + case PL_HOOK_PRE_KERNEL: + case PL_HOOK_POST_KERNEL: + case PL_HOOK_SCALED: + case PL_HOOK_PRE_OUTPUT: + case PL_HOOK_OUTPUT: + return false; + } + + abort(); +} + +// The different forms of communicating image data between the renderer and +// the hooks +enum pl_hook_sig { + PL_HOOK_SIG_NONE, // No data is passed, no data is received/returned + PL_HOOK_SIG_COLOR, // `vec4 color` already pre-sampled in a `pl_shader` + PL_HOOK_SIG_TEX, // `pl_tex` containing the image data + PL_HOOK_SIG_COUNT, +}; + +struct pl_hook_params { + // GPU objects associated with the `pl_renderer`, which the 
user may + // use for their own purposes. + pl_gpu gpu; + pl_dispatch dispatch; + + // Helper function to fetch a new temporary texture, using renderer-backed + // storage. This is guaranteed to have sane image usage requirements and a + // 16-bit or floating point format. The user does not need to free/destroy + // this texture in any way. May return NULL. + pl_tex (*get_tex)(void *priv, int width, int height); + void *priv; + + // Which stage triggered the hook to run. + enum pl_hook_stage stage; + + // For `PL_HOOK_SIG_COLOR`, this contains the existing shader object with + // the color already pre-sampled into `vec4 color`. The user may modify + // this as much as they want, as long as they don't dispatch/finalize/reset + // it. + // + // Note that this shader might have specific output size requirements, + // depending on the exact shader stage hooked by the user, and may already + // be a compute shader. + pl_shader sh; + + // For `PL_HOOK_SIG_TEX`, this contains the texture that the user should + // sample from. + // + // Note: This texture object is owned by the renderer, and users must not + // modify its contents. It will not be touched for the duration of a frame, + // but the contents are lost in between frames. + pl_tex tex; + + // The effective current rectangle of the image we're rendering in this + // shader, i.e. the effective rect of the content we're interested in, + // as a crop of either `sh` or `tex` (depending on the signature). + // + // Note: This is still set even for `PL_HOOK_SIG_NONE`! + pl_rect2df rect; + + // The current effective colorspace and representation, of either the + // pre-sampled color (in `sh`), or the contents of `tex`, respectively. + // + // Note: This is still set even for `PL_HOOK_SIG_NONE`! + struct pl_color_repr repr; + struct pl_color_space color; + int components; + + // The representation and colorspace of the original image, for reference. + const struct pl_color_repr *orig_repr; + const struct pl_color_space *orig_color; + + // The (cropped) source and destination rectangles of the overall + // rendering. These are functionally equivalent to `image.crop` and + // `target.crop`, respectively, but `src_rect` in particular may change as + // a result of previous hooks being executed. (e.g. prescalers) + pl_rect2df src_rect; + pl_rect2d dst_rect; +}; + +struct pl_hook_res { + // If true, the hook is assumed to have "failed" or errored in some way, + // and all other fields are ignored. + bool failed; + + // What type of output this hook is returning. + // Note: If this is `PL_HOOK_SIG_NONE`, all other fields are ignored. + enum pl_hook_sig output; + + // For `PL_HOOK_SIG_COLOR`, this *must* be set to a valid `pl_shader` + // object containing the sampled color value (i.e. with an output signature + // of `PL_SHADER_SIG_COLOR`), and *should* be allocated from the given + // `pl_dispatch` object. Ignored otherwise. + pl_shader sh; + + // For `PL_HOOK_SIG_TEX`, this *must* contain the texture object containing + // the result of rendering the hook. This *should* be a texture allocated + // using the given `get_tex` callback, to ensure the format and texture + // usage flags are compatible with what the renderer expects. + pl_tex tex; + + // For shaders that return some sort of output, this contains the + // new/altered versions of the existing "current texture" metadata. + struct pl_color_repr repr; + struct pl_color_space color; + int components; + + // This contains the new effective rect of the contents.
This may be + // different from the original `rect` for resizable passes. Ignored for + // non-resizable passes. + pl_rect2df rect; +}; + +enum pl_hook_par_mode { + PL_HOOK_PAR_VARIABLE, // normal shader variable + PL_HOOK_PAR_DYNAMIC, // dynamic shader variable, e.g. per-frame changing + PL_HOOK_PAR_CONSTANT, // fixed at compile time (e.g. for array sizes), + // must be scalar (non-vector/matrix) + PL_HOOK_PAR_DEFINE, // defined in the preprocessor, must be `int` + PL_HOOK_PAR_MODE_COUNT, +}; + +typedef union pl_var_data { + int i; + unsigned u; + float f; +} pl_var_data; + +struct pl_hook_par { + // Name as used in the shader. + const char *name; + + // Type of this shader parameter, and how it's manifested in the shader. + enum pl_var_type type; + enum pl_hook_par_mode mode; + + // Human-readable explanation of this parameter. (Optional) + const char *description; + + // Mutable data pointer to current value of variable. + pl_var_data *data; + + // Default/initial value, and lower/upper bounds. + pl_var_data initial; + pl_var_data minimum; + pl_var_data maximum; + + // Human-readable names for the variants of an integer option. This array + // can be indexed directly by integer values, ranging from `minimum.i` to + // `maximum.i`. May be NULL, in which case options are unnamed. + const char * const *names; +}; + +// Struct describing a hook. +// +// Note: Users may freely create their own instances of this struct, there is +// nothing particularly special about `pl_mpv_user_shader_parse`. +struct pl_hook { + enum pl_hook_stage stages; // Which stages to hook on + enum pl_hook_sig input; // Which input signature this hook expects + void *priv; // Arbitrary user context + + // Custom tunable shader parameters exported by this hook. These may be + // updated at any time by the user, to influence the behavior of the hook. + // Contents are arbitrary and subject to the method of hook construction. + const struct pl_hook_par *parameters; + int num_parameters; + + // Called at the beginning of passes, to reset/initialize the hook. (Optional) + void (*reset)(void *priv); + + // The hook function itself. Called by the renderer at any of the indicated + // hook stages. See `pl_hook_res` for more info on the return values. + struct pl_hook_res (*hook)(void *priv, const struct pl_hook_params *params); + + // Unique signature identifying this hook, used to disable misbehaving hooks. + // All hooks with the same signature will be disabled, should they fail to + // execute during run-time. + uint64_t signature; +}; + +// Compatibility layer with `mpv` user shaders. See the mpv man page for more +// information on the format. Will return `NULL` if the shader fails parsing. +// +// The resulting `pl_hook` objects should be destroyed with the corresponding +// destructor when no longer needed. +PL_API const struct pl_hook * +pl_mpv_user_shader_parse(pl_gpu gpu, const char *shader_text, size_t shader_len); + +PL_API void pl_mpv_user_shader_destroy(const struct pl_hook **hook); + +PL_API_END + +#endif // LIBPLACEBO_SHADERS_CUSTOM_H_ diff --git a/src/include/libplacebo/shaders/deinterlacing.h b/src/include/libplacebo/shaders/deinterlacing.h new file mode 100644 index 0000000..40e74e8 --- /dev/null +++ b/src/include/libplacebo/shaders/deinterlacing.h @@ -0,0 +1,137 @@ + +/* + * This file is part of libplacebo, which is normally licensed under the terms + * of the LGPL v2.1+. 
However, this file (deinterlacing.h) is also available under + * the terms of the more permissive MIT license: + * + * Copyright (c) 2018-2019 Niklas Haas + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef LIBPLACEBO_SHADERS_DEINTERLACING_H_ +#define LIBPLACEBO_SHADERS_DEINTERLACING_H_ + +#include <libplacebo/shaders.h> + +PL_API_BEGIN + +enum pl_field { + PL_FIELD_NONE = 0, // no deinterlacing + PL_FIELD_EVEN, // "top" fields, with even y coordinates + PL_FIELD_ODD, // "bottom" fields, with odd y coordinates + + // Convenience aliases + PL_FIELD_TOP = PL_FIELD_EVEN, + PL_FIELD_BOTTOM = PL_FIELD_ODD, +}; + +static inline enum pl_field pl_field_other(enum pl_field field) +{ + switch (field) { + case PL_FIELD_EVEN: return PL_FIELD_ODD; + case PL_FIELD_ODD: return PL_FIELD_EVEN; + default: return field; + } +} + +struct pl_field_pair { + // Top texture. If only this is specified, it's assumed to contain both + // fields in an interleaved fashion (MBAFF). + // + // Note: Support for separate fields (PAFF) is currently pending, so this + // is the only way to provide interlaced frames at the moment. + pl_tex top; +}; + +#define pl_field_pair(...) ((struct pl_field_pair) { __VA_ARGS__ }) + +struct pl_deinterlace_source { + // Previous, current and next source (interlaced) frames. `prev` and `next` + // may be NULL, but `cur` is required. If present, they must all have the + // exact same texture dimensions. + // + // Note: `prev` and `next` are only required for PL_DEINTERLACE_YADIF. + struct pl_field_pair prev, cur, next; + + // The parity of the current field to output. This field will be unmodified + // from `cur`, with the corresponding other field interpolated. + // + // If this is `PL_FIELD_NONE`, no deinterlacing is performed, and the + // texture is merely sampled as-is. + enum pl_field field; + + // The parity of the first frame in a stream. Set this to the field that is + // (conceptually) ordered first in time. + // + // If this is `PL_FIELD_NONE`, it will instead default to `PL_FIELD_TOP`. + enum pl_field first_field; + + // Components to deinterlace. Components not specified will be ignored. + // Optional, if left as 0, all components will be deinterlaced. + uint8_t component_mask; +}; + +#define pl_deinterlace_source(...) (&(struct pl_deinterlace_source) { __VA_ARGS__ }) + +enum pl_deinterlace_algorithm { + // No-op deinterlacing, just sample the weaved frame un-touched. + PL_DEINTERLACE_WEAVE = 0, + + // Naive bob deinterlacing.
Doubles the field lines vertically. + PL_DEINTERLACE_BOB, + + // "Yet another deinterlacing filter". Deinterlacer with temporal and + // spatial information. Based on FFmpeg's Yadif filter algorithm, but + // adapted slightly for the GPU. + PL_DEINTERLACE_YADIF, + + PL_DEINTERLACE_ALGORITHM_COUNT, +}; + +// Returns whether or not an algorithm requires `prev`/`next` refs to be set. +static inline bool pl_deinterlace_needs_refs(enum pl_deinterlace_algorithm algo) +{ + return algo == PL_DEINTERLACE_YADIF; +} + +struct pl_deinterlace_params { + // Algorithm to use. The recommended default is PL_DEINTERLACE_YADIF, which + // provides a good trade-off of quality and speed. + enum pl_deinterlace_algorithm algo; + + // Skip the spatial interlacing check. (PL_DEINTERLACE_YADIF only) + bool skip_spatial_check; +}; + +#define PL_DEINTERLACE_DEFAULTS \ + .algo = PL_DEINTERLACE_YADIF, + +#define pl_deinterlace_params(...) (&(struct pl_deinterlace_params) { PL_DEINTERLACE_DEFAULTS __VA_ARGS__ }) +PL_API extern const struct pl_deinterlace_params pl_deinterlace_default_params; + +// Deinterlaces a set of interleaved source frames and outputs the result into +// `vec4 color`. If `params` is left as NULL, it defaults to +// `&pl_deinterlace_default_params`. +PL_API void pl_shader_deinterlace(pl_shader sh, const struct pl_deinterlace_source *src, + const struct pl_deinterlace_params *params); + +PL_API_END + +#endif // LIBPLACEBO_SHADERS_DEINTERLACING_H_ diff --git a/src/include/libplacebo/shaders/dithering.h b/src/include/libplacebo/shaders/dithering.h new file mode 100644 index 0000000..9146c81 --- /dev/null +++ b/src/include/libplacebo/shaders/dithering.h @@ -0,0 +1,140 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_SHADERS_DITHERING_H_ +#define LIBPLACEBO_SHADERS_DITHERING_H_ + +// Dithering shaders + +#include <libplacebo/colorspace.h> +#include <libplacebo/dither.h> +#include <libplacebo/shaders.h> + +PL_API_BEGIN + +enum pl_dither_method { + // Dither with blue noise. Very high quality, but requires the use of a + // LUT. Warning: Computing a blue noise texture with a large size can be + // very slow, however this only needs to be performed once. Even so, using + // this with a `lut_size` greater than 6 is generally ill-advised. This is + // the preferred/default dither method. + PL_DITHER_BLUE_NOISE, + + // Dither with an ordered (bayer) dither matrix, using a LUT. Low quality, + // and since this also uses a LUT, there's generally no advantage to picking + // this instead of `PL_DITHER_BLUE_NOISE`. It's mainly there for testing. + PL_DITHER_ORDERED_LUT, + + // The same as `PL_DITHER_ORDERED_LUT`, but uses fixed function math instead + // of a LUT. This is faster, but only supports a fixed dither matrix size + // of 16x16 (equal to a `lut_size` of 4). + PL_DITHER_ORDERED_FIXED, + + // Dither with white noise. 
This does not require a LUT and is fairly cheap + // to compute. Unlike the other modes it doesn't show any repeating + // patterns either spatially or temporally, but the downside is that this + // is visually fairly jarring due to the presence of low frequencies in the + // noise spectrum. + PL_DITHER_WHITE_NOISE, + + PL_DITHER_METHOD_COUNT, +}; + +struct pl_dither_params { + // The source of the dither noise to use. + enum pl_dither_method method; + + // For the dither methods which require the use of a LUT, this controls + // the size of the LUT (base 2). If left as 0, this defaults to 6, which + // is equivalent to a 64x64 dither matrix. Must not be larger than 8. + int lut_size; + + // Enables temporal dithering. This reduces the persistence of dithering + // artifacts by perturbing the dithering matrix per frame. + // Warning: This can cause nasty aliasing artifacts on some LCD screens. + bool temporal; + + // Gamma function to use for dither gamma correction. This will only have + // an effect when dithering to low bit depths (<= 4). + enum pl_color_transfer transfer; +}; + +#define PL_DITHER_DEFAULTS \ + .method = PL_DITHER_BLUE_NOISE, \ + .lut_size = 6, \ + /* temporal dithering commonly flickers on LCDs */ \ + .temporal = false, + +#define pl_dither_params(...) (&(struct pl_dither_params) { PL_DITHER_DEFAULTS __VA_ARGS__ }) +PL_API extern const struct pl_dither_params pl_dither_default_params; + +// Dither the colors to a lower depth, given in bits. This can be used on input +// colors of any precision. Basically, this rounds the colors to only linear +// multiples of the stated bit depth. The average intensity of the result +// will not change (i.e., the dither noise is balanced in both directions). +// If `params` is NULL, it defaults to &pl_dither_default_params. +// +// For the dither methods which require the use of a LUT, `dither_state` must +// be set to a valid pointer. To avoid thrashing the resource, users should +// avoid trying to re-use the same LUT for different dither configurations. If +// passed as NULL, libplacebo will automatically fall back to dither algorithms +// that don't require the use of a LUT. +// +// Warning: This dithering algorithm is not gamma-invariant, so using it for +// very low bit depths (below 4 or so) will noticeably increase the brightness +// of the resulting image. When doing low bit depth dithering for aesthetic +// purposes, it's recommended that the user explicitly (de)linearize the colors +// before and after this algorithm. +PL_API void pl_shader_dither(pl_shader sh, int new_depth, + pl_shader_obj *dither_state, + const struct pl_dither_params *params); + +struct pl_error_diffusion_params { + // Both the input and output texture must be provided up-front, with the + // same size. The output texture must be storable, and the input texture + // must be sampleable. + pl_tex input_tex; + pl_tex output_tex; + + // Depth to dither to. Required. + int new_depth; + + // Error diffusion kernel to use. Optional. If unspecified, defaults to + // `&pl_error_diffusion_sierra_lite`. + const struct pl_error_diffusion_kernel *kernel; +}; + +#define pl_error_diffusion_params(...) (&(struct pl_error_diffusion_params) { __VA_ARGS__ }) + +// Computes the shared memory requirements for a given error diffusion kernel. +// This can be used to test up-front whether or not error diffusion would be +// supported, before having to initialize textures.
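+//
+// Example (illustrative sketch, assuming `gpu` and `out_tex` exist elsewhere):
+// probing whether a kernel is expected to fit before setting up the shader:
+//
+//   size_t shmem = pl_error_diffusion_shmem_req(&pl_error_diffusion_sierra_lite,
+//                                               out_tex->params.h);
+//   bool supported = shmem <= gpu->glsl.max_shmem_size;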
+PL_API size_t pl_error_diffusion_shmem_req(const struct pl_error_diffusion_kernel *kernel, + int height); + +// Apply an error diffusion dithering kernel. This is a much more expensive and +// heavy dithering method, and is not generally recommended for realtime usage +// where performance is critical. +// +// Requires compute shader support. Returns false if dithering fails, e.g. as a +// result of shader memory limits being exceeded. The resulting shader must be +// dispatched with a work group count of exactly 1. +PL_API bool pl_shader_error_diffusion(pl_shader sh, const struct pl_error_diffusion_params *params); + +PL_API_END + +#endif // LIBPLACEBO_SHADERS_DITHERING_H_ diff --git a/src/include/libplacebo/shaders/film_grain.h b/src/include/libplacebo/shaders/film_grain.h new file mode 100644 index 0000000..8a9c78b --- /dev/null +++ b/src/include/libplacebo/shaders/film_grain.h @@ -0,0 +1,137 @@ +/* + * This file is part of libplacebo, which is normally licensed under the terms + * of the LGPL v2.1+. However, this file (film_grain.h) is also available under + * the terms of the more permissive MIT license: + * + * Copyright (c) 2018-2019 Niklas Haas + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef LIBPLACEBO_SHADERS_FILM_GRAIN_H_ +#define LIBPLACEBO_SHADERS_FILM_GRAIN_H_ + +// Film grain synthesis shaders for AV1 / H.274. + +#include <stdint.h> +#include <stdbool.h> + +#include <libplacebo/colorspace.h> +#include <libplacebo/shaders.h> + +PL_API_BEGIN + +enum pl_film_grain_type { + PL_FILM_GRAIN_NONE = 0, + PL_FILM_GRAIN_AV1, + PL_FILM_GRAIN_H274, + PL_FILM_GRAIN_COUNT, +}; + +// AV1 film grain parameters. For the exact meaning of these, see the AV1 +// specification (section 6.8.20). +struct pl_av1_grain_data { + int num_points_y; + uint8_t points_y[14][2]; // [n][0] = value, [n][1] = scaling + bool chroma_scaling_from_luma; + int num_points_uv[2]; // should be {0} for grayscale images + uint8_t points_uv[2][10][2]; // like points_y for points_uv[0, 1] = u, v + int scaling_shift; + int ar_coeff_lag; + int8_t ar_coeffs_y[24]; + int8_t ar_coeffs_uv[2][25]; + int ar_coeff_shift; + int grain_scale_shift; + int8_t uv_mult[2]; + int8_t uv_mult_luma[2]; + int16_t uv_offset[2]; // 9-bit value, range [-256, 255] + bool overlap; +}; + +// H.274 film grain parameters. For the exact meaning of these, see the H.274 +// specification (section 8.5).
+struct pl_h274_grain_data { + int model_id; + int blending_mode_id; + int log2_scale_factor; + bool component_model_present[3]; + uint16_t num_intensity_intervals[3]; + uint8_t num_model_values[3]; + const uint8_t *intensity_interval_lower_bound[3]; + const uint8_t *intensity_interval_upper_bound[3]; + const int16_t (*comp_model_value[3])[6]; +}; + +// Tagged union for film grain data +struct pl_film_grain_data { + enum pl_film_grain_type type; // film grain type + uint64_t seed; // shared seed value + + union { + // Warning: These values are not sanity-checked at all. Invalid grain + // data results in undefined behavior! + struct pl_av1_grain_data av1; + struct pl_h274_grain_data h274; + } params; +}; + +// Options for the `pl_shader_film_grain` call. +struct pl_film_grain_params { + // Required for all film grain types: + struct pl_film_grain_data data; // film grain data + pl_tex tex; // texture to sample from + struct pl_color_repr *repr; // underlying color representation (see notes) + int components; + int component_mapping[4]; // same as `struct pl_plane` + + // Notes for `repr`: + // - repr->bits affects the rounding for grain generation + // - repr->levels affects whether or not we clip to full range + // - repr->sys affects the interpretation of channels + // - *repr gets normalized by this shader, which is why it's a pointer + + // Required for PL_FILM_GRAIN_AV1 only: + pl_tex luma_tex; // "luma" texture (see notes) + int luma_comp; // index of luma in `luma_tex` + + // Notes for `luma_tex`: + // - `luma_tex` must be specified if the `tex` does not itself contain the + // "luma-like" component. For XYZ systems, the Y channel is the luma + // component. For RGB systems, the G channel is. +}; + +#define pl_film_grain_params(...) (&(struct pl_film_grain_params) { __VA_ARGS__ }) + +// Test if film grain needs to be applied. This is a helper function that users +// can use to decide whether or not `pl_shader_film_grain` needs to be called, +// based on the given grain metadata. +PL_API bool pl_needs_film_grain(const struct pl_film_grain_params *params); + +// Sample from a texture while applying film grain at the same time. +// `grain_state` must be unique for every plane configuration, as it may +// contain plane-dependent state. +// +// Returns false on any error, or if film grain generation is not supported +// due to GLSL limitations. +PL_API bool pl_shader_film_grain(pl_shader sh, pl_shader_obj *grain_state, + const struct pl_film_grain_params *params); + +PL_API_END + +#endif // LIBPLACEBO_SHADERS_FILM_GRAIN_H_ diff --git a/src/include/libplacebo/shaders/icc.h b/src/include/libplacebo/shaders/icc.h new file mode 100644 index 0000000..a4003f4 --- /dev/null +++ b/src/include/libplacebo/shaders/icc.h @@ -0,0 +1,135 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef LIBPLACEBO_SHADERS_ICC_H_
+#define LIBPLACEBO_SHADERS_ICC_H_
+
+// Functions for generating and applying ICC-derived (3D)LUTs
+
+#include <libplacebo/colorspace.h>
+#include <libplacebo/shaders.h>
+
+PL_API_BEGIN
+
+struct pl_icc_params {
+ // The rendering intent to use, for profiles with multiple intents. A
+ // recommended value is PL_INTENT_RELATIVE_COLORIMETRIC for color-accurate
+ // video reproduction, or PL_INTENT_PERCEPTUAL for profiles containing
+ // meaningful perceptual mapping tables for some more suitable color space
+ // like BT.709.
+ //
+ // If this is set to the special value PL_INTENT_AUTO, will use the
+ // preferred intent provided by the profile header.
+ enum pl_rendering_intent intent;
+
+ // The size of the 3DLUT to generate. If left as 0, these individually
+ // default to values appropriate for the profile. (Based on internal
+ // precision heuristics)
+ //
+ // Note: Setting this manually is strongly discouraged, as it can result
+ // in excessively high 3DLUT sizes where a much smaller LUT would have
+ // sufficed.
+ int size_r, size_g, size_b;
+
+ // This field can be used to override the detected brightness level of the
+ // ICC profile. If you set this to the special value 0 (or a negative
+ // number), libplacebo will attempt reading the brightness value from the
+ // ICC profile's tagging (if available), falling back to PL_COLOR_SDR_WHITE
+ // if unavailable.
+ float max_luma;
+
+ // Force black point compensation. May help avoid crushed or raised black
+ // points on "improper" profiles containing e.g. colorimetric tables that
+ // do not round-trip. Should not be required on well-behaved profiles,
+ // or when using PL_INTENT_PERCEPTUAL, but YMMV.
+ bool force_bpc;
+
+ // If provided, this pl_cache instance will be used, instead of the
+ // GPU-internal cache, to cache the generated 3DLUTs. Note that these can
+ // get large, especially for large values of size_{r,g,b}, so the user may
+ // wish to split this cache off from the main shader cache. (Optional)
+ pl_cache cache;
+
+ // Deprecated legacy caching API. Replaced by `cache`.
+ PL_DEPRECATED void *cache_priv;
+ PL_DEPRECATED void (*cache_save)(void *priv, uint64_t sig, const uint8_t *cache, size_t size);
+ PL_DEPRECATED bool (*cache_load)(void *priv, uint64_t sig, uint8_t *cache, size_t size);
+};
+
+#define PL_ICC_DEFAULTS \
+ .intent = PL_INTENT_RELATIVE_COLORIMETRIC, \
+ .max_luma = PL_COLOR_SDR_WHITE,
+
+#define pl_icc_params(...) (&(struct pl_icc_params) { PL_ICC_DEFAULTS __VA_ARGS__ })
+PL_API extern const struct pl_icc_params pl_icc_default_params;
+
+// This object represents a "parsed" ICC profile.
+typedef const struct pl_icc_object_t {
+ // Provided params, with the `intent` and `size` fields set (as described)
+ struct pl_icc_params params;
+
+ // Signature of the corresponding ICC profile.
+ uint64_t signature;
+
+ // Detected color space (or UNKNOWN for profiles which don't contain an
+ // exact match), with HDR metadata set to the detected gamut and
+ // white/black value ranges.
+ struct pl_color_space csp;
+
+ // Best estimate of profile gamma. This only serves as a rough guideline.
+ float gamma;
+
+ // Smallest containing primary set, always set.
+ enum pl_color_primaries containing_primaries;
+} *pl_icc_object;
+
+// Attempts opening/parsing the contents of an ICC profile. The resulting
+// object is memory managed and may outlive the original profile - access
+// to the underlying profile is no longer needed once this returns.
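+//
+// For illustration, a typical open/decode cycle might look roughly like this
+// (a minimal sketch only; `log`, `sh` and the `pl_icc_profile` are assumed to
+// be provided by the caller):
+//
+//   pl_icc_object icc = pl_icc_open(log, &profile, pl_icc_params( .force_bpc = true ));
+//   if (icc) {
+//       pl_shader_obj lut = NULL;
+//       struct pl_color_space out_csp;
+//       pl_icc_decode(sh, icc, &lut, &out_csp); // now in linear light RGB
+//       // ... operate in linear light, optionally pl_icc_encode() back ...
+//       pl_shader_obj_destroy(&lut);
+//       pl_icc_close(&icc);
+//   }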
+PL_API pl_icc_object pl_icc_open(pl_log log, const struct pl_icc_profile *profile, + const struct pl_icc_params *params); +PL_API void pl_icc_close(pl_icc_object *icc); + +// Update an existing pl_icc_object, which may be NULL, replacing it by the +// new profile and parameters (if incompatible). +// +// Returns success. `obj` is set to the created profile, or NULL on error. +// +// Note: If `profile->signature` matches `(*obj)->signature`, or if `profile` is +// NULL, then the existing profile is directly reused, with only the effective +// parameters changing. In this case, `profile->data` is also *not* read from, +// and may safely be NULL. +PL_API bool pl_icc_update(pl_log log, pl_icc_object *obj, + const struct pl_icc_profile *profile, + const struct pl_icc_params *params); + +// Decode the input from the colorspace determined by the attached ICC profile +// to linear light RGB (in the profile's containing primary set). `lut` must be +// set to a shader object that will store the GPU resources associated with the +// generated LUT. The resulting color space will be written to `out_csp`. +PL_API void pl_icc_decode(pl_shader sh, pl_icc_object profile, pl_shader_obj *lut, + struct pl_color_space *out_csp); + +// Encode the input from linear light RGB (in the profile's containing primary +// set) into the colorspace determined by the attached ICC profile. `lut` must +// be set to a shader object that will store the GPU resources associated with +// the generated LUT. +PL_API void pl_icc_encode(pl_shader sh, pl_icc_object profile, pl_shader_obj *lut); + +PL_API_END + +#endif // LIBPLACEBO_SHADERS_ICC_H_ diff --git a/src/include/libplacebo/shaders/lut.h b/src/include/libplacebo/shaders/lut.h new file mode 100644 index 0000000..6e30ddc --- /dev/null +++ b/src/include/libplacebo/shaders/lut.h @@ -0,0 +1,78 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_SHADERS_LUT_H_ +#define LIBPLACEBO_SHADERS_LUT_H_ + +// Shaders for loading and applying arbitrary custom 1D/3DLUTs + +#include <libplacebo/colorspace.h> +#include <libplacebo/shaders.h> + +PL_API_BEGIN + +// Struct defining custom LUTs +// +// Note: Users may freely create their own instances of this struct, there is +// nothing particularly special about `pl_lut_parse_cube`. +struct pl_custom_lut { + // Some unique signature identifying this LUT, needed to detect state + // changes (for cache invalidation). This should ideally be a hash of the + // file contents. (Which is what `pl_lut_parse_*` will set it to.) + uint64_t signature; + + // Size of each dimension, in the order R, G, B. For 1D LUTs, only the R + // dimension should be specified (the others left as 0). + int size[3]; + + // Raw LUT data itself, in properly scaled floating point format. 
For 3D + // LUTs, the innermost dimension is the first dimension (R), and the + // outermost dimension is the last dimension (B). Individual color samples + // are in the order R, G, B. + const float *data; + + // Extra input/output shaper matrices. Ignored if equal to {0}. This is + // mostly useful for 1D LUTs, since 3D LUTs can bake the shaper matrix into + // the LUT itself - but it can still help optimize LUT precision. + pl_matrix3x3 shaper_in, shaper_out; + + // Nominal metadata for the input/output of a LUT. Left as {0} if unknown. + // Note: This is purely informative, `pl_shader_custom_lut` ignores it. + struct pl_color_repr repr_in, repr_out; + struct pl_color_space color_in, color_out; +}; + +// Parse a 3DLUT in .cube format. Returns NULL if the file fails parsing. +PL_API struct pl_custom_lut *pl_lut_parse_cube(pl_log log, const char *str, size_t str_len); + +// Frees a LUT created by `pl_lut_parse_*`. +PL_API void pl_lut_free(struct pl_custom_lut **lut); + +// Apply a `pl_custom_lut`. The user is responsible for ensuring colors going +// into the LUT are in the expected format as informed by the LUT metadata. +// +// `lut_state` must be a pointer to a NULL-initialized shader state object that +// will be used to encapsulate any required GPU state. +// +// Note: `lut` does not have to be allocated by `pl_lut_parse_*`. It can be a +// struct filled out by the user. +PL_API void pl_shader_custom_lut(pl_shader sh, const struct pl_custom_lut *lut, + pl_shader_obj *lut_state); + +PL_API_END + +#endif // LIBPLACEBO_SHADERS_LUT_H_ diff --git a/src/include/libplacebo/shaders/sampling.h b/src/include/libplacebo/shaders/sampling.h new file mode 100644 index 0000000..5221e44 --- /dev/null +++ b/src/include/libplacebo/shaders/sampling.h @@ -0,0 +1,257 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_SHADERS_SAMPLING_H_ +#define LIBPLACEBO_SHADERS_SAMPLING_H_ + +// Sampling operations. These shaders perform some form of sampling operation +// from a given pl_tex. In order to use these, the pl_shader *must* have been +// created using the same `gpu` as the originating `pl_tex`. Otherwise, this +// is undefined behavior. They require nothing (PL_SHADER_SIG_NONE) and return +// a color (PL_SHADER_SIG_COLOR). + +#include <libplacebo/colorspace.h> +#include <libplacebo/filters.h> +#include <libplacebo/shaders.h> + +PL_API_BEGIN + +// Common parameters for sampling operations +struct pl_sample_src { + // There are two mutually exclusive ways of providing the source to sample + // from: + // + // 1. Provide the texture and sampled region directly. 
This generates + // a shader with input signature `PL_SHADER_SIG_NONE`, which binds the + // texture as a descriptor (and the coordinates as a vertex attribute) + pl_tex tex; // texture to sample + pl_rect2df rect; // sub-rect to sample from (optional) + enum pl_tex_address_mode address_mode; // preferred texture address mode + + // 2. Have the shader take it as an argument. Doing this requires + // specifying the missing metadata of the texture backing the sampler, so + // that the shader generation can generate the correct code. + int tex_w, tex_h; // dimensions of the actual texture + enum pl_fmt_type format; // format of the sampler being accepted + enum pl_sampler_type sampler; // type of the sampler being accepted + enum pl_tex_sample_mode mode; // sample mode of the sampler being accepted + float sampled_w, sampled_h; // dimensions of the sampled region (optional) + + // Common metadata for both sampler input types: + int components; // number of components to sample (optional) + uint8_t component_mask; // bitmask of components to sample (optional) + int new_w, new_h; // dimensions of the resulting output (optional) + float scale; // factor to multiply into sampled signal (optional) + + // Note: `component_mask` and `components` are mutually exclusive, the + // former is preferred if both are specified. +}; + +#define pl_sample_src(...) (&(struct pl_sample_src) { __VA_ARGS__ }) + +struct pl_deband_params { + // The number of debanding steps to perform per sample. Each step reduces a + // bit more banding, but takes time to compute. Note that the strength of + // each step falls off very quickly, so high numbers (>4) are practically + // useless. Defaults to 1. + int iterations; + + // The debanding filter's cut-off threshold. Higher numbers increase the + // debanding strength dramatically, but progressively diminish image + // details. Defaults to 3.0. + float threshold; + + // The debanding filter's initial radius. The radius increases linearly + // for each iteration. A higher radius will find more gradients, but a + // lower radius will smooth more aggressively. Defaults to 16.0. + float radius; + + // Add some extra noise to the image. This significantly helps cover up + // remaining quantization artifacts. Higher numbers add more noise. + // Note: When debanding HDR sources, even a small amount of grain can + // result in a very big change to the brightness level. It's recommended to + // either scale this value down or disable it entirely for HDR. + // + // Defaults to 4.0, which is very mild. + float grain; + + // 'Neutral' grain value for each channel being debanded (sorted in order + // from low to high index). Grain application will be modulated to avoid + // disturbing colors close to this value. Set this to a value corresponding + // to black in the relevant colorspace. + float grain_neutral[3]; +}; + +#define PL_DEBAND_DEFAULTS \ + .iterations = 1, \ + .threshold = 3.0, \ + .radius = 16.0, \ + .grain = 4.0, + +#define pl_deband_params(...) (&(struct pl_deband_params) {PL_DEBAND_DEFAULTS __VA_ARGS__ }) +PL_API extern const struct pl_deband_params pl_deband_default_params; + +// Debands a given texture and returns the sampled color in `vec4 color`. If +// `params` is left as NULL, it defaults to &pl_deband_default_params. Note +// that `tex->params.format` must have PL_FMT_CAP_LINEAR. When the given +// `pl_sample_src` implies scaling, this effectively performs bilinear +// sampling on the input (but not the output). 
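+//
+// For illustration, a call might look like this (a sketch only; `sh` and `tex`
+// are assumed to come from the caller):
+//
+//   pl_shader_deband(sh, pl_sample_src( .tex = tex ),
+//                    pl_deband_params( .iterations = 2, .grain = 0.0 ));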
+//
+// Note: This can also be used as a pure grain function, by setting the number
+// of iterations to 0.
+PL_API void pl_shader_deband(pl_shader sh, const struct pl_sample_src *src,
+ const struct pl_deband_params *params);
+
+// Performs direct / native texture sampling, using whatever texture filter is
+// available (linear for linearly sampleable sources, nearest otherwise).
+//
+// Note: This is generally very low quality and should be avoided if possible,
+// for both upscaling and downscaling.
+PL_API bool pl_shader_sample_direct(pl_shader sh, const struct pl_sample_src *src);
+
+// Performs hardware-accelerated nearest neighbour sampling. This is similar to
+// `pl_shader_sample_direct`, but forces nearest neighbour interpolation.
+PL_API bool pl_shader_sample_nearest(pl_shader sh, const struct pl_sample_src *src);
+
+// Performs hardware-accelerated bilinear sampling. This is similar to
+// `pl_shader_sample_direct`, but forces bilinear interpolation.
+PL_API bool pl_shader_sample_bilinear(pl_shader sh, const struct pl_sample_src *src);
+
+// Optimized versions of specific, strictly positive scaler kernels that take
+// advantage of linear texture sampling to reduce the number of fetches needed
+// by a factor of four. This family of functions performs radius-2 scaling
+// with only four texture fetches, which is far more efficient than using
+// the generalized 1D scaling method. Only works well for upscaling.
+PL_API bool pl_shader_sample_bicubic(pl_shader sh, const struct pl_sample_src *src);
+PL_API bool pl_shader_sample_hermite(pl_shader sh, const struct pl_sample_src *src);
+PL_API bool pl_shader_sample_gaussian(pl_shader sh, const struct pl_sample_src *src);
+
+// A sampler that is similar to nearest neighbour sampling, but tries to
+// preserve pixel aspect ratios. This is mathematically equivalent to taking an
+// idealized image with square pixels, sampling it at an infinite resolution,
+// and then downscaling that to the desired resolution. (Hence it being called
+// "oversample"). Good for pixel art.
+//
+// The threshold provides a cutoff threshold below which the contribution of
+// pixels should be ignored, trading some amount of aspect ratio distortion for
+// a slightly crisper image. A value of `threshold == 0.5` makes this filter
+// equivalent to regular nearest neighbour sampling.
+PL_API bool pl_shader_sample_oversample(pl_shader sh, const struct pl_sample_src *src,
+ float threshold);
+
+struct pl_sample_filter_params {
+ // The filter to use for sampling.
+ struct pl_filter_config filter;
+
+ // Antiringing strength. A value of 0.0 disables antiringing, and a value
+ // of 1.0 enables full-strength antiringing. Defaults to 0.0 if
+ // unspecified.
+ //
+ // Note: Ignored if `filter.antiring` is already set to something nonzero.
+ float antiring;
+
+ // Disable the use of compute shaders (e.g. if rendering to non-storable tex)
+ bool no_compute;
+ // Disable the use of filter widening / anti-aliasing (for downscaling)
+ bool no_widening;
+
+ // This shader object is used to store the LUT, and will be recreated
+ // if necessary. To avoid thrashing the resource, users should avoid trying
+ // to re-use the same LUT for different filter configurations or scaling
+ // ratios. Must be set to a valid pointer, and the target NULL-initialized.
+ pl_shader_obj *lut;
+
+ // Deprecated / removed fields
+ int lut_entries PL_DEPRECATED; // hard-coded as 256
+ float cutoff PL_DEPRECATED; // hard-coded as 1e-3
+};
+
+#define pl_sample_filter_params(...)
(&(struct pl_sample_filter_params) { __VA_ARGS__ }) + +// Performs polar sampling. This internally chooses between an optimized compute +// shader, and various fragment shaders, depending on the supported GLSL version +// and GPU features. Returns whether or not it was successful. +// +// Note: `params->filter.polar` must be true to use this function. +PL_API bool pl_shader_sample_polar(pl_shader sh, const struct pl_sample_src *src, + const struct pl_sample_filter_params *params); + +// Performs orthogonal (1D) sampling. Using this twice in a row (once vertical +// and once horizontal) effectively performs a 2D upscale. This is lower +// quality than polar sampling, but significantly faster, and therefore the +// recommended default. Returns whether or not it was successful. +// +// `src` must represent a scaling operation that only scales in one direction, +// i.e. either only X or only Y. The other direction must be left unscaled. +// +// Note: Due to internal limitations, this may currently only be used on 2D +// textures - even though the basic principle would work for 1D and 3D textures +// as well. +PL_API bool pl_shader_sample_ortho2(pl_shader sh, const struct pl_sample_src *src, + const struct pl_sample_filter_params *params); + +struct pl_distort_params { + // An arbitrary 2x2 affine transformation to apply to the input image. + // For simplicity, the input image is explicitly centered and scaled such + // that the longer dimension is in [-1,1], before applying this. + pl_transform2x2 transform; + + // If true, the texture is placed inside the center of the canvas without + // scaling. If false, it is effectively stretched to the canvas size. + bool unscaled; + + // If true, the transformation is automatically scaled down and shifted to + // ensure that the resulting image fits inside the output canvas. + bool constrain; + + // If true, use bicubic interpolation rather than faster bilinear + // interpolation. Higher quality but slower. + bool bicubic; + + // Specifies the texture address mode to use when sampling out of bounds. + enum pl_tex_address_mode address_mode; + + // If set, all out-of-bounds accesses will instead be treated as + // transparent, according to the given alpha mode. (Which should match the + // alpha mode of the texture) + // + // Note: `address_mode` has no effect when this is specified. + enum pl_alpha_mode alpha_mode; +}; + +#define PL_DISTORT_DEFAULTS \ + .transform.mat.m = {{ 1, 0 }, {0, 1}}, + +#define pl_distort_params(...) (&(struct pl_distort_params) {PL_DISTORT_DEFAULTS __VA_ARGS__ }) +PL_API extern const struct pl_distort_params pl_distort_default_params; + +// Distorts the input image using a given set of transformation parameters. +// `out_w` and `out_h` determine the size of the effective canvas inside which +// the distorted result may be rendered. Areas outside of this canvas will +// be implicitly cut off. +PL_API void pl_shader_distort(pl_shader sh, pl_tex tex, int out_w, int out_h, + const struct pl_distort_params *params); + +enum PL_DEPRECATED { // for `int pass` + PL_SEP_VERT = 0, + PL_SEP_HORIZ, + PL_SEP_PASSES +}; + +PL_API_END + +#endif // LIBPLACEBO_SHADERS_SAMPLING_H_ diff --git a/src/include/libplacebo/swapchain.h b/src/include/libplacebo/swapchain.h new file mode 100644 index 0000000..b53aa5c --- /dev/null +++ b/src/include/libplacebo/swapchain.h @@ -0,0 +1,171 @@ +/* + * This file is part of libplacebo. 
+ * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_SWAPCHAIN_H_ +#define LIBPLACEBO_SWAPCHAIN_H_ + +#include <libplacebo/common.h> +#include <libplacebo/colorspace.h> +#include <libplacebo/gpu.h> + +PL_API_BEGIN + +// This abstraction represents a low-level interface to visible surfaces +// exposed by a graphics API (and accompanying GPU instance), allowing users to +// directly present frames to the screen (or window, typically). This is a +// sister API to gpu.h and follows the same convention w.r.t undefined behavior. +// +// Thread-safety: Safe +typedef const struct pl_swapchain_t { + pl_log log; + pl_gpu gpu; +} *pl_swapchain; + +// Destroys this swapchain. May be used at any time, and may block until the +// completion of all outstanding rendering commands. The swapchain and any +// resources retrieved from it must not be used afterwards. +PL_API void pl_swapchain_destroy(pl_swapchain *sw); + +// Returns the approximate current swapchain latency in vsyncs, or 0 if +// unknown. A latency of 1 means that `submit_frame` followed by `swap_buffers` +// will block until the just-submitted frame has finished rendering. Typical +// values are 2 or 3, which enable better pipelining by allowing the GPU to be +// processing one or two frames at the same time as the user is preparing the +// next for submission. +PL_API int pl_swapchain_latency(pl_swapchain sw); + +// Update/query the swapchain size. This function performs both roles: it tries +// setting the swapchain size to the values requested by the user, and returns +// in the same variables what width/height the swapchain was actually set to - +// which may be (substantially) different from the values requested by the +// user. A value of 0 means "unknown/none" (in which case, libplacebo won't try +// updating the size - it will simply return the current state of the +// swapchain). It's also possible for libplacebo to return values of 0, such as +// in the case that the swapchain doesn't exist yet. +// +// Returns false on significant errors (e.g. dead surface). This function can +// effectively be used to probe if creating a swapchain works. +PL_API bool pl_swapchain_resize(pl_swapchain sw, int *width, int *height); + +// Backwards compatibility +#define pl_swapchain_colors pl_color_space + +// Inform the swapchain about the input color space. This API deliberately +// provides no feedback, because the swapchain can internally decide what to do +// with this information, including ignoring it entirely, or applying it +// asynchronously. Users must still base their rendering on the value of +// `pl_swapchain_frame.color_space`. +// +// Note: Calling this function a second time completely overrides any +// previously specified hint. So calling this on {0} or NULL resets the +// swapchain back to its initial/preferred colorspace. 
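+//
+// For example, hinting an HDR10-style colorspace might look like this (a
+// sketch only; the chosen values are purely illustrative):
+//
+//   pl_swapchain_colorspace_hint(sw, &(struct pl_color_space) {
+//       .primaries = PL_COLOR_PRIM_BT_2020,
+//       .transfer = PL_COLOR_TRC_PQ,
+//   });
+//
+//   pl_swapchain_colorspace_hint(sw, NULL); // reset to the preferred colorspace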
+// +// Note: If `csp->transfer` is a HDR transfer curve but HDR metadata is left +// unspecified, the HDR metadata defaults to `pl_hdr_metadata_hdr10`. +// Conversely, if the HDR metadata is non-empty but `csp->transfer` is left as +// PL_COLOR_TRC_UNKNOWN, then it instead defaults to PL_COLOR_TRC_PQ. +PL_API void pl_swapchain_colorspace_hint(pl_swapchain sw, const struct pl_color_space *csp); + +// The struct used to hold the results of `pl_swapchain_start_frame` +struct pl_swapchain_frame { + // A texture representing the framebuffer users should use for rendering. + // It's guaranteed that `fbo->params.renderable` and `fbo->params.blit_dst` + // will be true, but no other guarantees are made - not even that + // `fbo->params.format` is a real format. + pl_tex fbo; + + // If true, the user should assume that this framebuffer will be flipped + // as a result of presenting it on-screen. If false, nothing special needs + // to be done - but if true, users should flip the coordinate system of + // the `pl_pass` that is rendering to this framebuffer. + // + // Note: Normally, libplacebo follows the convention that (0,0) represents + // the top left of the image/screen. So when flipped is true, this means + // (0,0) on this framebuffer gets displayed as the bottom left of the image. + bool flipped; + + // Indicates the color representation this framebuffer will be interpreted + // as by the host system / compositor / display, including the bit depth + // and alpha handling (where available). + struct pl_color_repr color_repr; + struct pl_color_space color_space; +}; + +// Retrieve a new frame from the swapchain. Returns whether successful. It's +// worth noting that this function can fail sporadically for benign reasons, +// for example the window being invisible or inaccessible. This function may +// block until an image is available, which may be the case if the GPU is +// rendering frames significantly faster than the display can output them. It +// may also be non-blocking, so users shouldn't rely on this call alone in +// order to meter rendering speed. (Specifics depend on the underlying graphics +// API) +PL_API bool pl_swapchain_start_frame(pl_swapchain sw, struct pl_swapchain_frame *out_frame); + +// Submits the previously started frame. Non-blocking. This must be issued in +// lockstep with pl_swapchain_start_frame - there is no way to start multiple +// frames and submit them out-of-order. The frames submitted this way will +// generally be made visible in a first-in first-out fashion, although +// specifics depend on the mechanism used to create the pl_swapchain. (See the +// platform-specific APIs for more info). +// +// Returns whether successful. This should normally never fail, unless the +// GPU/surface has been lost or some other critical error has occurred. The +// "started" frame is consumed even in the event of failure. +// +// Note that `start_frame` and `submit_frame` form a lock pair, i.e. trying to +// call e.g. `pl_swapchain_resize` from another thread will block until +// `pl_swapchain_submit_frame` is finished. +PL_API bool pl_swapchain_submit_frame(pl_swapchain sw); + +// Performs a "buffer swap", or some generalization of the concept. In layman's +// terms, this blocks until the execution of the Nth previously submitted frame +// has been "made complete" in some sense. (The N derives from the swapchain's +// built-in latency. See `pl_swapchain_latency` for more information). 
+// +// Users should include this call in their rendering loops in order to make +// sure they aren't submitting rendering commands faster than the GPU can +// process them, which would potentially lead to a queue overrun or exhaust +// memory. +// +// An example loop might look like this: +// +// while (rendering) { +// struct pl_swapchain_frame frame; +// bool ok = pl_swapchain_start_frame(swapchain, &frame); +// if (!ok) { +// /* wait some time, or decide to stop rendering */ +// continue; +// } +// +// /* do some rendering with frame.fbo */ +// +// ok = pl_swapchain_submit_frame(swapchain); +// if (!ok) +// break; +// +// pl_swapchain_swap_buffers(swapchain); +// } +// +// The duration this function blocks for, if at all, may be very inconsistent +// and should not be used as an authoritative source of vsync timing +// information without sufficient smoothing/filtering (and if so, the time that +// `start_frame` blocked for should also be included). +PL_API void pl_swapchain_swap_buffers(pl_swapchain sw); + +PL_API_END + +#endif // LIBPLACEBO_SWAPCHAIN_H_ diff --git a/src/include/libplacebo/tone_mapping.h b/src/include/libplacebo/tone_mapping.h new file mode 100644 index 0000000..48f1eb7 --- /dev/null +++ b/src/include/libplacebo/tone_mapping.h @@ -0,0 +1,268 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_TONE_MAPPING_H_ +#define LIBPLACEBO_TONE_MAPPING_H_ + +#include <stddef.h> +#include <stdbool.h> + +#include <libplacebo/common.h> +#include <libplacebo/colorspace.h> + +PL_API_BEGIN + +struct pl_tone_map_params; +struct pl_tone_map_function { + const char *name; // Identifier + const char *description; // Friendly / longer name + + // This controls the type of values input/output to/from `map` + enum pl_hdr_scaling scaling; + + // The tone-mapping function itself. Iterates over all values in `lut`, and + // adapts them as needed. + // + // Note that the `params` struct fed into this function is guaranteed to + // satisfy `params->input_scaling == params->output_scaling == scaling`, + // and also obeys `params->input_max >= params->output_max`. + void (*map)(float *lut, const struct pl_tone_map_params *params); + + // Inverse tone mapping function. Optional. If absent, this tone mapping + // curve only works in the forwards direction. + // + // For this function, `params->input_max <= params->output_max`. + void (*map_inverse)(float *lut, const struct pl_tone_map_params *params); + + // Private data. Unused by libplacebo, but may be accessed by `map`. + void *priv; + + // --- Deprecated fields + const char *param_desc PL_DEPRECATED; + float param_min PL_DEPRECATED; + float param_def PL_DEPRECATED; + float param_max PL_DEPRECATED; +}; + +struct pl_tone_map_constants { + // Configures the knee point, as a ratio between the source average and + // target average (in PQ space). 
An adaptation of 1.0 always adapts the
+ // source scene average brightness to the (scaled) target average,
+ // while a value of 0.0 never modifies scene brightness. [0,1]
+ //
+ // Affects all methods that use the ST2094 knee point determination
+ // (currently ST2094-40, ST2094-10 and spline)
+ float knee_adaptation;
+
+ // Configures the knee point minimum and maximum, respectively, as
+ // a percentage of the PQ luminance range. Provides a hard limit on the
+ // knee point chosen by `knee_adaptation`.
+ float knee_minimum; // (0, 0.5)
+ float knee_maximum; // (0.5, 1.0)
+
+ // Default knee point to use in the absence of source scene average
+ // metadata. Normally, this is ignored in favor of picking the knee
+ // point as the (relative) source scene average brightness level.
+ float knee_default; // [knee_minimum, knee_maximum]
+
+ // Knee point offset (for BT.2390 only). Note that a value of 0.5 is
+ // the spec-defined default behavior, which differs from the libplacebo
+ // default of 1.0. [0.5, 2]
+ float knee_offset;
+
+ // For the single-pivot polynomial (spline) function, this controls the
+ // coefficients used to tune the slope of the curve. This tuning is designed
+ // to make the slope closer to 1.0 when the difference in peaks is low,
+ // and closer to linear when the difference between peaks is high.
+ float slope_tuning; // [0,10]
+ float slope_offset; // [0,1]
+
+ // Contrast setting for the spline function. Higher values make the curve
+ // steeper (closer to `clip`), preserving midtones at the cost of losing
+ // shadow/highlight details, while lower values make the curve shallower
+ // (closer to `linear`), preserving highlights at the cost of losing midtone
+ // contrast. Values above 1.0 are possible, resulting in an output with more
+ // contrast than the input.
+ float spline_contrast; // [0,1.5]
+
+ // For the reinhard function, this specifies the local contrast coefficient
+ // at the display peak. Essentially, a value of 0.5 implies that the
+ // reference white will be about half as bright as when clipping. (0,1)
+ float reinhard_contrast;
+
+ // For legacy functions (mobius, gamma) which operate on linear light, this
+ // directly sets the corresponding knee point. (0,1)
+ float linear_knee;
+
+ // For linear methods (linear, linearlight), this controls the linear
+ // exposure/gain applied to the image. (0,10]
+ float exposure;
+};
+
+#define PL_TONE_MAP_CONSTANTS \
+ .knee_adaptation = 0.4f, \
+ .knee_minimum = 0.1f, \
+ .knee_maximum = 0.8f, \
+ .knee_default = 0.4f, \
+ .knee_offset = 1.0f, \
+ .slope_tuning = 1.5f, \
+ .slope_offset = 0.2f, \
+ .spline_contrast = 0.5f, \
+ .reinhard_contrast = 0.5f, \
+ .linear_knee = 0.3f, \
+ .exposure = 1.0f,
+
+struct pl_tone_map_params {
+ // If `function` is NULL, defaults to `pl_tone_map_clip`.
+ const struct pl_tone_map_function *function;
+
+ // Common constants, should be initialized to PL_TONE_MAP_CONSTANTS if
+ // not intending to override them further.
+ struct pl_tone_map_constants constants;
+
+ // The desired input/output scaling of the tone map. If this differs from
+ // `function->scaling`, any required conversion will be performed.
+ //
+ // Note that to maximize LUT efficiency, it's *highly* recommended to use
+ // either PL_HDR_PQ or PL_HDR_SQRT as the input scaling, except when
+ // using `pl_tone_map_sample`.
+ enum pl_hdr_scaling input_scaling;
+ enum pl_hdr_scaling output_scaling;
+
+ // The size of the resulting LUT.
(For `pl_tone_map_generate` only)
+ size_t lut_size;
+
+ // The characteristics of the input, in `input_scaling` units.
+ float input_min;
+ float input_max;
+ float input_avg; // or 0 if unknown
+
+ // The desired characteristics of the output, in `output_scaling` units.
+ float output_min;
+ float output_max;
+
+ // The input HDR metadata. Only used by a select few tone-mapping
+ // functions, currently only SMPTE ST2094. (Optional)
+ struct pl_hdr_metadata hdr;
+
+ // --- Deprecated fields
+ float param PL_DEPRECATED; // see `constants`
+};
+
+#define pl_tone_map_params(...) (&(struct pl_tone_map_params) { __VA_ARGS__ })
+
+// Note: Only does pointer equality testing on `function`
+PL_API bool pl_tone_map_params_equal(const struct pl_tone_map_params *a,
+ const struct pl_tone_map_params *b);
+
+// Clamps/defaults the parameters, including input/output maximum.
+PL_API void pl_tone_map_params_infer(struct pl_tone_map_params *params);
+
+// Returns true if the given tone mapping configuration effectively represents
+// a no-op configuration. Tone mapping can be skipped in this case (although
+// strictly speaking, the LUT would still clip illegal input values)
+PL_API bool pl_tone_map_params_noop(const struct pl_tone_map_params *params);
+
+// Generate a tone-mapping LUT for a given configuration. This will always
+// span the entire input range, as given by `input_min` and `input_max`.
+PL_API void pl_tone_map_generate(float *out, const struct pl_tone_map_params *params);
+
+// Samples a tone mapping function at a single position. Note that this is less
+// efficient than `pl_tone_map_generate` for generating multiple values.
+//
+// Ignores `params->lut_size`.
+PL_API float pl_tone_map_sample(float x, const struct pl_tone_map_params *params);
+
+// Performs no tone-mapping, just clips out-of-range colors. Retains perfect
+// color accuracy for in-range colors but completely destroys out-of-range
+// information. Does not perform any black point adaptation.
+PL_API extern const struct pl_tone_map_function pl_tone_map_clip;
+
+// EETF from SMPTE ST 2094-40 Annex B, which uses the provided OOTF based on
+// Bezier curves to perform tone-mapping. The OOTF used is adjusted based on
+// the ratio between the targeted and actual display peak luminances. In the
+// absence of HDR10+ metadata, falls back to a simple constant bezier curve.
+PL_API extern const struct pl_tone_map_function pl_tone_map_st2094_40;
+
+// EETF from SMPTE ST 2094-10 Annex B.2, which takes into account the input
+// signal average luminance in addition to the maximum/minimum.
+//
+// Note: This does *not* currently include the subjective gain/offset/gamma
+// controls defined in Annex B.3. (Open an issue with a valid sample file if
+// you want such parameters to be respected.)
+PL_API extern const struct pl_tone_map_function pl_tone_map_st2094_10;
+
+// EETF from the ITU-R Report BT.2390, a hermite spline roll-off with linear
+// segment.
+PL_API extern const struct pl_tone_map_function pl_tone_map_bt2390;
+
+// EETF from ITU-R Report BT.2446, method A. Can be used for both forward
+// and inverse tone mapping.
+PL_API extern const struct pl_tone_map_function pl_tone_map_bt2446a;
+
+// Simple spline consisting of two polynomials, joined by a single pivot point,
+// which is tuned based on the source scene average brightness (taking into
+// account dynamic metadata if available). This function can be used
+// for both forward and inverse tone mapping.
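+//
+// For illustration, generating a 256-entry LUT with this curve might look
+// roughly like this (a sketch only; the luminance values are arbitrary
+// examples, converted to PQ via `pl_hdr_rescale` from colorspace.h):
+//
+//   struct pl_tone_map_params params = {
+//       .function = &pl_tone_map_spline,
+//       .constants = { PL_TONE_MAP_CONSTANTS },
+//       .input_scaling = PL_HDR_PQ,
+//       .output_scaling = PL_HDR_PQ,
+//       .lut_size = 256,
+//       .input_max = pl_hdr_rescale(PL_HDR_NITS, PL_HDR_PQ, 1000.0f),
+//       .output_max = pl_hdr_rescale(PL_HDR_NITS, PL_HDR_PQ, 203.0f),
+//   };
+//   pl_tone_map_params_infer(&params);
+//   float lut[256];
+//   pl_tone_map_generate(lut, &params);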
+PL_API extern const struct pl_tone_map_function pl_tone_map_spline; + +// Very simple non-linear curve. Named after Erik Reinhard. +PL_API extern const struct pl_tone_map_function pl_tone_map_reinhard; + +// Generalization of the reinhard tone mapping algorithm to support an +// additional linear slope near black. The name is derived from its function +// shape (ax+b)/(cx+d), which is known as a Möbius transformation. +PL_API extern const struct pl_tone_map_function pl_tone_map_mobius; + +// Piece-wise, filmic tone-mapping algorithm developed by John Hable for use in +// Uncharted 2, inspired by a similar tone-mapping algorithm used by Kodak. +// Popularized by its use in video games with HDR rendering. Preserves both +// dark and bright details very well, but comes with the drawback of changing +// the average brightness quite significantly. This is sort of similar to +// pl_tone_map_reinhard with `reinhard_contrast=0.24`. +PL_API extern const struct pl_tone_map_function pl_tone_map_hable; + +// Fits a gamma (power) function to transfer between the source and target +// color spaces, effectively resulting in a perceptual hard-knee joining two +// roughly linear sections. This preserves details at all scales, but can result +// in an image with a muted or dull appearance. +PL_API extern const struct pl_tone_map_function pl_tone_map_gamma; + +// Linearly stretches the input range to the output range, in PQ space. This +// will preserve all details accurately, but results in a significantly +// different average brightness. Can be used for inverse tone-mapping in +// addition to regular tone-mapping. +PL_API extern const struct pl_tone_map_function pl_tone_map_linear; + +// Like `pl_tone_map_linear`, but in linear light (instead of PQ). Works well +// for small range adjustments but may cause severe darkening when +// downconverting from e.g. 10k nits to SDR. +PL_API extern const struct pl_tone_map_function pl_tone_map_linear_light; + +// A list of built-in tone mapping functions, terminated by NULL +PL_API extern const struct pl_tone_map_function * const pl_tone_map_functions[]; +PL_API extern const int pl_num_tone_map_functions; // excluding trailing NULL + +// Find the tone mapping function with the given name, or NULL on failure. +PL_API const struct pl_tone_map_function *pl_find_tone_map_function(const char *name); + +// Deprecated alias, do not use +#define pl_tone_map_auto pl_tone_map_spline + +PL_API_END + +#endif // LIBPLACEBO_TONE_MAPPING_H_ diff --git a/src/include/libplacebo/utils/dav1d.h b/src/include/libplacebo/utils/dav1d.h new file mode 100644 index 0000000..ece97c5 --- /dev/null +++ b/src/include/libplacebo/utils/dav1d.h @@ -0,0 +1,129 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#ifndef LIBPLACEBO_DAV1D_H_ +#define LIBPLACEBO_DAV1D_H_ + +#include <libplacebo/gpu.h> +#include <libplacebo/utils/upload.h> +#include <dav1d/dav1d.h> + +#if defined(__cplusplus) && !defined(PL_DAV1D_IMPLEMENTATION) +# define PL_DAV1D_API +# define PL_DAV1D_IMPLEMENTATION 0 +# warning Remember to include this file with a PL_DAV1D_IMPLEMENTATION set to 1 in \ + C translation unit to provide implementation. Suppress this warning by \ + defining PL_DAV1D_IMPLEMENTATION to 0 in C++ files. +#elif !defined(PL_DAV1D_IMPLEMENTATION) +# define PL_DAV1D_API static inline +# define PL_DAV1D_IMPLEMENTATION 1 +#else +# define PL_DAV1D_API +#endif + +PL_API_BEGIN + +// Fill in the details of a `pl_frame` from a Dav1dPicture. This function will +// explicitly clear `out_frame`, setting all extra fields to 0. After this +// function returns, the only missing data is information related to the plane +// texture itself (`planes[N].texture`). +// +// Note: This will include all possible metadata, including HDR metadata and +// AV1 film grain data. Users should explicitly clear this out if undesired. +PL_DAV1D_API void pl_frame_from_dav1dpicture(struct pl_frame *out_frame, + const Dav1dPicture *picture); + +// Helper function to generate a `pl_color_space` struct from a Dav1dPicture. +// Useful to update the swapchain colorspace mode dynamically (e.g. for HDR). +PL_DAV1D_API void pl_swapchain_colors_from_dav1dpicture(struct pl_color_space *out_colors, + const Dav1dPicture *picture); + +struct pl_dav1d_upload_params { + // The picture to upload. Not modified unless `asynchronous` is true. + Dav1dPicture *picture; + + // If true, film grain present in `picture` will be exported to the + // `pl_frame` as well. This should be set to false unless the user has + // disabled `Dav1dSettings.apply_grain`. + bool film_grain; + + // If true, libplacebo will probe for the allocation metadata set by + // `pl_allocate_dav1dpicture`, and directly import the attached buffers + // (saving a memcpy in some cases). Has no effect if the Dav1dPicture was + // not allocated using `pl_allocate_dav1dpicture`. + // + // Note: When this is the case, `asynchronous` has no further effect - + // uploads from attached buffers are already asynchronous. + bool gpu_allocated; + + // If true, `picture` will be asynchronously uploaded and unref'd + // internally by libplacebo, and the struct passed by the user cleared to + // {0}. This is needed to avoid `memcpy` in some cases, so setting it to + // true is highly recommended wherever possible. + // + // Note: If `pl_upload_dav1dpicture` returns false, `picture` does not get + // unref'd. + bool asynchronous; +}; + +#define pl_dav1d_upload_params(...) (&(struct pl_dav1d_upload_params) { __VA_ARGS__ }) + +// Very high level helper function to take a `Dav1dPicture` and upload it to +// the GPU. Similar in spirit to `pl_upload_plane`, and the same notes apply. +// `tex` must be an array of 3 pointers of type `pl_tex`, each +// either pointing to a valid texture, or NULL. Returns whether successful. +PL_DAV1D_API bool pl_upload_dav1dpicture(pl_gpu gpu, + struct pl_frame *out_frame, pl_tex tex[3], + const struct pl_dav1d_upload_params *params); + +// Allocate a Dav1dPicture from persistently mapped buffers. This can be more +// efficient than regular Dav1dPictures, especially when using the synchronous +// `pl_upload_dav1dpicture`, or on platforms that don't support importing +// PL_HANDLE_HOST_PTR as buffers. Returns 0 or a negative DAV1D_ERR value. 
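+//
+// For illustration, wiring these up as a picture allocator might look roughly
+// like this (a sketch only; assumes a valid, thread-safe `gpu` and otherwise
+// default decoder settings):
+//
+//   Dav1dSettings settings;
+//   dav1d_default_settings(&settings);
+//   settings.allocator = (Dav1dPicAllocator) {
+//       .cookie = (void *) gpu,
+//       .alloc_picture_callback = pl_allocate_dav1dpicture,
+//       .release_picture_callback = pl_release_dav1dpicture,
+//   };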
+// +// Note: These may only be used directly as a Dav1dPicAllocator if the `gpu` +// passed as the value of `cookie` is `pl_gpu.limits.thread_safe`. Otherwise, +// the user must manually synchronize this to ensure it runs on the correct +// thread. +PL_DAV1D_API int pl_allocate_dav1dpicture(Dav1dPicture *picture, void *gpu); +PL_DAV1D_API void pl_release_dav1dpicture(Dav1dPicture *picture, void *gpu); + +// Mapping functions for the various Dav1dColor* enums. Note that these are not +// quite 1:1, and even for values that exist in both, the semantics sometimes +// differ. Some special cases (e.g. ICtCp, or XYZ) are handled differently in +// libplacebo and libdav1d, respectively. +PL_DAV1D_API enum pl_color_system pl_system_from_dav1d(enum Dav1dMatrixCoefficients mc); +PL_DAV1D_API enum Dav1dMatrixCoefficients pl_system_to_dav1d(enum pl_color_system sys); +PL_DAV1D_API enum pl_color_levels pl_levels_from_dav1d(int color_range); +PL_DAV1D_API int pl_levels_to_dav1d(enum pl_color_levels levels); +PL_DAV1D_API enum pl_color_primaries pl_primaries_from_dav1d(enum Dav1dColorPrimaries prim); +PL_DAV1D_API enum Dav1dColorPrimaries pl_primaries_to_dav1d(enum pl_color_primaries prim); +PL_DAV1D_API enum pl_color_transfer pl_transfer_from_dav1d(enum Dav1dTransferCharacteristics trc); +PL_DAV1D_API enum Dav1dTransferCharacteristics pl_transfer_to_dav1d(enum pl_color_transfer trc); +PL_DAV1D_API enum pl_chroma_location pl_chroma_from_dav1d(enum Dav1dChromaSamplePosition loc); +PL_DAV1D_API enum Dav1dChromaSamplePosition pl_chroma_to_dav1d(enum pl_chroma_location loc); + + +// Actual implementation, included as part of this header to avoid having +// a compile-time dependency on libdav1d. +#if PL_DAV1D_IMPLEMENTATION +# include <libplacebo/utils/dav1d_internal.h> +#endif + +PL_API_END + +#endif // LIBPLACEBO_DAV1D_H_ diff --git a/src/include/libplacebo/utils/dav1d_internal.h b/src/include/libplacebo/utils/dav1d_internal.h new file mode 100644 index 0000000..2e0512a --- /dev/null +++ b/src/include/libplacebo/utils/dav1d_internal.h @@ -0,0 +1,613 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#ifndef LIBPLACEBO_DAV1D_H_ +#error This header should be included as part of <libplacebo/utils/dav1d.h> +#elif defined(__cplusplus) +#error This header cannot be included from C++ define PL_DAV1D_IMPLEMENTATION appropriately +#else + +#include <assert.h> +#include <stdlib.h> +#include <string.h> + +PL_DAV1D_API enum pl_color_system pl_system_from_dav1d(enum Dav1dMatrixCoefficients mc) +{ + switch (mc) { + case DAV1D_MC_IDENTITY: return PL_COLOR_SYSTEM_RGB; // or XYZ (unlikely) + case DAV1D_MC_BT709: return PL_COLOR_SYSTEM_BT_709; + case DAV1D_MC_UNKNOWN: return PL_COLOR_SYSTEM_UNKNOWN; + case DAV1D_MC_FCC: return PL_COLOR_SYSTEM_UNKNOWN; // missing + case DAV1D_MC_BT470BG: return PL_COLOR_SYSTEM_BT_601; + case DAV1D_MC_BT601: return PL_COLOR_SYSTEM_BT_601; + case DAV1D_MC_SMPTE240: return PL_COLOR_SYSTEM_SMPTE_240M; + case DAV1D_MC_SMPTE_YCGCO: return PL_COLOR_SYSTEM_YCGCO; + case DAV1D_MC_BT2020_NCL: return PL_COLOR_SYSTEM_BT_2020_NC; + case DAV1D_MC_BT2020_CL: return PL_COLOR_SYSTEM_BT_2020_C; + case DAV1D_MC_SMPTE2085: return PL_COLOR_SYSTEM_UNKNOWN; // missing + case DAV1D_MC_CHROMAT_NCL: return PL_COLOR_SYSTEM_UNKNOWN; // missing + case DAV1D_MC_CHROMAT_CL: return PL_COLOR_SYSTEM_UNKNOWN; // missing + // Note: this colorspace is confused between PQ and HLG, which dav1d + // requires inferring from other sources, but libplacebo makes + // explicit. Default to PQ as it's the more common scenario. + case DAV1D_MC_ICTCP: return PL_COLOR_SYSTEM_BT_2100_PQ; + case DAV1D_MC_RESERVED: abort(); + } + + return PL_COLOR_SYSTEM_UNKNOWN; +} + +PL_DAV1D_API enum Dav1dMatrixCoefficients pl_system_to_dav1d(enum pl_color_system sys) +{ + switch (sys) { + case PL_COLOR_SYSTEM_UNKNOWN: return DAV1D_MC_UNKNOWN; + case PL_COLOR_SYSTEM_BT_601: return DAV1D_MC_BT601; + case PL_COLOR_SYSTEM_BT_709: return DAV1D_MC_BT709; + case PL_COLOR_SYSTEM_SMPTE_240M: return DAV1D_MC_SMPTE240; + case PL_COLOR_SYSTEM_BT_2020_NC: return DAV1D_MC_BT2020_NCL; + case PL_COLOR_SYSTEM_BT_2020_C: return DAV1D_MC_BT2020_CL; + case PL_COLOR_SYSTEM_BT_2100_PQ: return DAV1D_MC_ICTCP; + case PL_COLOR_SYSTEM_BT_2100_HLG: return DAV1D_MC_ICTCP; + case PL_COLOR_SYSTEM_DOLBYVISION: return DAV1D_MC_UNKNOWN; // missing + case PL_COLOR_SYSTEM_YCGCO: return DAV1D_MC_SMPTE_YCGCO; + case PL_COLOR_SYSTEM_RGB: return DAV1D_MC_IDENTITY; + case PL_COLOR_SYSTEM_XYZ: return DAV1D_MC_IDENTITY; + case PL_COLOR_SYSTEM_COUNT: abort(); + } + + return DAV1D_MC_UNKNOWN; +} + +PL_DAV1D_API enum pl_color_levels pl_levels_from_dav1d(int color_range) +{ + return color_range ? 
PL_COLOR_LEVELS_FULL : PL_COLOR_LEVELS_LIMITED; +} + +PL_DAV1D_API int pl_levels_to_dav1d(enum pl_color_levels levels) +{ + return levels == PL_COLOR_LEVELS_FULL; +} + +PL_DAV1D_API enum pl_color_primaries pl_primaries_from_dav1d(enum Dav1dColorPrimaries prim) +{ + switch (prim) { + case DAV1D_COLOR_PRI_BT709: return PL_COLOR_PRIM_BT_709; + case DAV1D_COLOR_PRI_UNKNOWN: return PL_COLOR_PRIM_UNKNOWN; + case DAV1D_COLOR_PRI_RESERVED: return PL_COLOR_PRIM_UNKNOWN; + case DAV1D_COLOR_PRI_BT470M: return PL_COLOR_PRIM_BT_470M; + case DAV1D_COLOR_PRI_BT470BG: return PL_COLOR_PRIM_BT_601_625; + case DAV1D_COLOR_PRI_BT601: return PL_COLOR_PRIM_BT_601_525; + case DAV1D_COLOR_PRI_SMPTE240: return PL_COLOR_PRIM_BT_601_525; + case DAV1D_COLOR_PRI_FILM: return PL_COLOR_PRIM_FILM_C; + case DAV1D_COLOR_PRI_BT2020: return PL_COLOR_PRIM_BT_2020; + case DAV1D_COLOR_PRI_XYZ: return PL_COLOR_PRIM_UNKNOWN; + case DAV1D_COLOR_PRI_SMPTE431: return PL_COLOR_PRIM_DCI_P3; + case DAV1D_COLOR_PRI_SMPTE432: return PL_COLOR_PRIM_DISPLAY_P3; + case DAV1D_COLOR_PRI_EBU3213: return PL_COLOR_PRIM_EBU_3213; + } + + return PL_COLOR_PRIM_UNKNOWN; +} + +PL_DAV1D_API enum Dav1dColorPrimaries pl_primaries_to_dav1d(enum pl_color_primaries prim) +{ + switch (prim) { + case PL_COLOR_PRIM_UNKNOWN: return DAV1D_COLOR_PRI_UNKNOWN; + case PL_COLOR_PRIM_BT_601_525: return DAV1D_COLOR_PRI_BT601; + case PL_COLOR_PRIM_BT_601_625: return DAV1D_COLOR_PRI_BT470BG; + case PL_COLOR_PRIM_BT_709: return DAV1D_COLOR_PRI_BT709; + case PL_COLOR_PRIM_BT_470M: return DAV1D_COLOR_PRI_BT470M; + case PL_COLOR_PRIM_EBU_3213: return DAV1D_COLOR_PRI_EBU3213; + case PL_COLOR_PRIM_BT_2020: return DAV1D_COLOR_PRI_BT2020; + case PL_COLOR_PRIM_APPLE: return DAV1D_COLOR_PRI_UNKNOWN; // missing + case PL_COLOR_PRIM_ADOBE: return DAV1D_COLOR_PRI_UNKNOWN; // missing + case PL_COLOR_PRIM_PRO_PHOTO: return DAV1D_COLOR_PRI_UNKNOWN; // missing + case PL_COLOR_PRIM_CIE_1931: return DAV1D_COLOR_PRI_UNKNOWN; // missing + case PL_COLOR_PRIM_DCI_P3: return DAV1D_COLOR_PRI_SMPTE431; + case PL_COLOR_PRIM_DISPLAY_P3: return DAV1D_COLOR_PRI_SMPTE432; + case PL_COLOR_PRIM_V_GAMUT: return DAV1D_COLOR_PRI_UNKNOWN; // missing + case PL_COLOR_PRIM_S_GAMUT: return DAV1D_COLOR_PRI_UNKNOWN; // missing + case PL_COLOR_PRIM_FILM_C: return DAV1D_COLOR_PRI_FILM; + case PL_COLOR_PRIM_ACES_AP0: return DAV1D_COLOR_PRI_UNKNOWN; // missing + case PL_COLOR_PRIM_ACES_AP1: return DAV1D_COLOR_PRI_UNKNOWN; // missing + case PL_COLOR_PRIM_COUNT: abort(); + } + + return DAV1D_COLOR_PRI_UNKNOWN; +} + +PL_DAV1D_API enum pl_color_transfer pl_transfer_from_dav1d(enum Dav1dTransferCharacteristics trc) +{ + switch (trc) { + case DAV1D_TRC_BT709: return PL_COLOR_TRC_BT_1886; // EOTF != OETF + case DAV1D_TRC_UNKNOWN: return PL_COLOR_TRC_UNKNOWN; + case DAV1D_TRC_BT470M: return PL_COLOR_TRC_GAMMA22; + case DAV1D_TRC_BT470BG: return PL_COLOR_TRC_GAMMA28; + case DAV1D_TRC_BT601: return PL_COLOR_TRC_BT_1886; // EOTF != OETF + case DAV1D_TRC_SMPTE240: return PL_COLOR_TRC_BT_1886; // EOTF != OETF + case DAV1D_TRC_LINEAR: return PL_COLOR_TRC_LINEAR; + case DAV1D_TRC_LOG100: return PL_COLOR_TRC_UNKNOWN; // missing + case DAV1D_TRC_LOG100_SQRT10: return PL_COLOR_TRC_UNKNOWN; // missing + case DAV1D_TRC_IEC61966: return PL_COLOR_TRC_BT_1886; // EOTF != OETF + case DAV1D_TRC_BT1361: return PL_COLOR_TRC_BT_1886; // ETOF != OETF + case DAV1D_TRC_SRGB: return PL_COLOR_TRC_SRGB; + case DAV1D_TRC_BT2020_10BIT: return PL_COLOR_TRC_BT_1886; // EOTF != OETF + case DAV1D_TRC_BT2020_12BIT: return PL_COLOR_TRC_BT_1886; // EOTF != 
OETF + case DAV1D_TRC_SMPTE2084: return PL_COLOR_TRC_PQ; + case DAV1D_TRC_SMPTE428: return PL_COLOR_TRC_ST428; + case DAV1D_TRC_HLG: return PL_COLOR_TRC_HLG; + case DAV1D_TRC_RESERVED: abort(); + } + + return PL_COLOR_TRC_UNKNOWN; +} + +PL_DAV1D_API enum Dav1dTransferCharacteristics pl_transfer_to_dav1d(enum pl_color_transfer trc) +{ + switch (trc) { + case PL_COLOR_TRC_UNKNOWN: return DAV1D_TRC_UNKNOWN; + case PL_COLOR_TRC_BT_1886: return DAV1D_TRC_BT709; // EOTF != OETF + case PL_COLOR_TRC_SRGB: return DAV1D_TRC_SRGB; + case PL_COLOR_TRC_LINEAR: return DAV1D_TRC_LINEAR; + case PL_COLOR_TRC_GAMMA18: return DAV1D_TRC_UNKNOWN; // missing + case PL_COLOR_TRC_GAMMA20: return DAV1D_TRC_UNKNOWN; // missing + case PL_COLOR_TRC_GAMMA22: return DAV1D_TRC_BT470M; + case PL_COLOR_TRC_GAMMA24: return DAV1D_TRC_UNKNOWN; // missing + case PL_COLOR_TRC_GAMMA26: return DAV1D_TRC_UNKNOWN; // missing + case PL_COLOR_TRC_GAMMA28: return DAV1D_TRC_BT470BG; + case PL_COLOR_TRC_ST428: return DAV1D_TRC_SMPTE428; + case PL_COLOR_TRC_PRO_PHOTO: return DAV1D_TRC_UNKNOWN; // missing + case PL_COLOR_TRC_PQ: return DAV1D_TRC_SMPTE2084; + case PL_COLOR_TRC_HLG: return DAV1D_TRC_HLG; + case PL_COLOR_TRC_V_LOG: return DAV1D_TRC_UNKNOWN; // missing + case PL_COLOR_TRC_S_LOG1: return DAV1D_TRC_UNKNOWN; // missing + case PL_COLOR_TRC_S_LOG2: return DAV1D_TRC_UNKNOWN; // missing + case PL_COLOR_TRC_COUNT: abort(); + } + + return DAV1D_TRC_UNKNOWN; +} + +PL_DAV1D_API enum pl_chroma_location pl_chroma_from_dav1d(enum Dav1dChromaSamplePosition loc) +{ + switch (loc) { + case DAV1D_CHR_UNKNOWN: return PL_CHROMA_UNKNOWN; + case DAV1D_CHR_VERTICAL: return PL_CHROMA_LEFT; + case DAV1D_CHR_COLOCATED: return PL_CHROMA_TOP_LEFT; + } + + return PL_CHROMA_UNKNOWN; +} + +PL_DAV1D_API enum Dav1dChromaSamplePosition pl_chroma_to_dav1d(enum pl_chroma_location loc) +{ + switch (loc) { + case PL_CHROMA_UNKNOWN: return DAV1D_CHR_UNKNOWN; + case PL_CHROMA_LEFT: return DAV1D_CHR_VERTICAL; + case PL_CHROMA_CENTER: return DAV1D_CHR_UNKNOWN; // missing + case PL_CHROMA_TOP_LEFT: return DAV1D_CHR_COLOCATED; + case PL_CHROMA_TOP_CENTER: return DAV1D_CHR_UNKNOWN; // missing + case PL_CHROMA_BOTTOM_LEFT: return DAV1D_CHR_UNKNOWN; // missing + case PL_CHROMA_BOTTOM_CENTER: return DAV1D_CHR_UNKNOWN; // missing + case PL_CHROMA_COUNT: abort(); + } + + return DAV1D_CHR_UNKNOWN; +} + +static inline float pl_fixed24_8(uint32_t n) +{ + return (float) n / (1 << 8); +} + +static inline float pl_fixed18_14(uint32_t n) +{ + return (float) n / (1 << 14); +} + +static inline float pl_fixed0_16(uint16_t n) +{ + return (float) n / (1 << 16); +} + +// Align to a power of 2 +#define PL_ALIGN2(x, align) (((x) + (align) - 1) & ~((align) - 1)) + +PL_DAV1D_API void pl_frame_from_dav1dpicture(struct pl_frame *out, + const Dav1dPicture *picture) +{ + const Dav1dSequenceHeader *seq_hdr = picture->seq_hdr; + int num_planes; + switch (picture->p.layout) { + case DAV1D_PIXEL_LAYOUT_I400: + num_planes = 1; + break; + case DAV1D_PIXEL_LAYOUT_I420: + case DAV1D_PIXEL_LAYOUT_I422: + case DAV1D_PIXEL_LAYOUT_I444: + num_planes = 3; + break; + default: abort(); + } + + *out = (struct pl_frame) { + .num_planes = num_planes, + .planes = { + // Components are always in order, which makes things easy + { + .components = 1, + .component_mapping = {0}, + }, { + .components = 1, + .component_mapping = {1}, + }, { + .components = 1, + .component_mapping = {2}, + }, + }, + .crop = { + 0, 0, picture->p.w, picture->p.h, + }, + .color = { + .primaries = pl_primaries_from_dav1d(seq_hdr->pri), + 
.transfer = pl_transfer_from_dav1d(seq_hdr->trc), + }, + .repr = { + .sys = pl_system_from_dav1d(seq_hdr->mtrx), + .levels = pl_levels_from_dav1d(seq_hdr->color_range), + .bits = { + .sample_depth = PL_ALIGN2(picture->p.bpc, 8), + .color_depth = picture->p.bpc, + }, + }, + }; + + if (seq_hdr->mtrx == DAV1D_MC_ICTCP && seq_hdr->trc == DAV1D_TRC_HLG) { + + // dav1d makes no distinction between PQ and HLG ICtCp, so we need + // to manually fix it in the case that we have HLG ICtCp data. + out->repr.sys = PL_COLOR_SYSTEM_BT_2100_HLG; + + } else if (seq_hdr->mtrx == DAV1D_MC_IDENTITY && + seq_hdr->pri == DAV1D_COLOR_PRI_XYZ) + { + + // dav1d handles this as a special case, but doesn't provide an + // explicit flag for it either, so we have to resort to this ugly hack, + // even though CIE 1931 RGB *is* a valid thing in principle! + out->repr.sys= PL_COLOR_SYSTEM_XYZ; + + } else if (!out->repr.sys) { + + // PL_COLOR_SYSTEM_UNKNOWN maps to RGB, so hard-code this one + out->repr.sys = pl_color_system_guess_ycbcr(picture->p.w, picture->p.h); + } + + const Dav1dContentLightLevel *cll = picture->content_light; + if (cll) { + out->color.hdr.max_cll = cll->max_content_light_level; + out->color.hdr.max_fall = cll->max_frame_average_light_level; + } + + // This overrides the CLL values above, if both are present + const Dav1dMasteringDisplay *md = picture->mastering_display; + if (md) { + out->color.hdr.max_luma = pl_fixed24_8(md->max_luminance); + out->color.hdr.min_luma = pl_fixed18_14(md->min_luminance); + out->color.hdr.prim = (struct pl_raw_primaries) { + .red.x = pl_fixed0_16(md->primaries[0][0]), + .red.y = pl_fixed0_16(md->primaries[0][1]), + .green.x = pl_fixed0_16(md->primaries[1][0]), + .green.y = pl_fixed0_16(md->primaries[1][1]), + .blue.x = pl_fixed0_16(md->primaries[2][0]), + .blue.y = pl_fixed0_16(md->primaries[2][1]), + .white.x = pl_fixed0_16(md->white_point[0]), + .white.y = pl_fixed0_16(md->white_point[1]), + }; + } + + if (picture->frame_hdr->film_grain.present) { + const Dav1dFilmGrainData *fg = &picture->frame_hdr->film_grain.data; + out->film_grain = (struct pl_film_grain_data) { + .type = PL_FILM_GRAIN_AV1, + .seed = fg->seed, + .params.av1 = { + .num_points_y = fg->num_y_points, + .chroma_scaling_from_luma = fg->chroma_scaling_from_luma, + .num_points_uv = { fg->num_uv_points[0], fg->num_uv_points[1] }, + .scaling_shift = fg->scaling_shift, + .ar_coeff_lag = fg->ar_coeff_lag, + .ar_coeff_shift = (int) fg->ar_coeff_shift, + .grain_scale_shift = fg->grain_scale_shift, + .uv_mult = { fg->uv_mult[0], fg->uv_mult[1] }, + .uv_mult_luma = { fg->uv_luma_mult[0], fg->uv_luma_mult[1] }, + .uv_offset = { fg->uv_offset[0], fg->uv_offset[1] }, + .overlap = fg->overlap_flag, + }, + }; + + struct pl_av1_grain_data *av1 = &out->film_grain.params.av1; + memcpy(av1->points_y, fg->y_points, sizeof(av1->points_y)); + memcpy(av1->points_uv, fg->uv_points, sizeof(av1->points_uv)); + memcpy(av1->ar_coeffs_y, fg->ar_coeffs_y, sizeof(av1->ar_coeffs_y)); + memcpy(av1->ar_coeffs_uv[0], fg->ar_coeffs_uv[0], sizeof(av1->ar_coeffs_uv[0])); + memcpy(av1->ar_coeffs_uv[1], fg->ar_coeffs_uv[1], sizeof(av1->ar_coeffs_uv[1])); + } + + switch (picture->p.layout) { + case DAV1D_PIXEL_LAYOUT_I400: + case DAV1D_PIXEL_LAYOUT_I444: + break; + case DAV1D_PIXEL_LAYOUT_I420: + case DAV1D_PIXEL_LAYOUT_I422: + // Only set the chroma location for definitely subsampled images + pl_frame_set_chroma_location(out, pl_chroma_from_dav1d(seq_hdr->chr)); + break; + } +} + +PL_DAV1D_API void 
pl_swapchain_colors_from_dav1dpicture(struct pl_swapchain_colors *out_colors, + const Dav1dPicture *picture) +{ + struct pl_frame frame; + pl_frame_from_dav1dpicture(&frame, picture); + + *out_colors = (struct pl_swapchain_colors) { + .primaries = frame.color.primaries, + .transfer = frame.color.transfer, + }; + + const Dav1dContentLightLevel *cll = picture->content_light; + if (cll) { + out_colors->hdr.max_cll = cll->max_content_light_level; + out_colors->hdr.max_fall = cll->max_frame_average_light_level; + } + + const Dav1dMasteringDisplay *md = picture->mastering_display; + if (md) { + out_colors->hdr.min_luma = pl_fixed18_14(md->min_luminance); + out_colors->hdr.max_luma = pl_fixed24_8(md->max_luminance); + out_colors->hdr.prim.red.x = pl_fixed0_16(md->primaries[0][0]); + out_colors->hdr.prim.red.y = pl_fixed0_16(md->primaries[0][1]); + out_colors->hdr.prim.green.x = pl_fixed0_16(md->primaries[1][0]); + out_colors->hdr.prim.green.y = pl_fixed0_16(md->primaries[1][1]); + out_colors->hdr.prim.blue.x = pl_fixed0_16(md->primaries[2][0]); + out_colors->hdr.prim.blue.y = pl_fixed0_16(md->primaries[2][1]); + out_colors->hdr.prim.white.x = pl_fixed0_16(md->white_point[0]); + out_colors->hdr.prim.white.y = pl_fixed0_16(md->white_point[1]); + } +} + +#define PL_MAGIC0 0x2c2a1269 +#define PL_MAGIC1 0xc6d02577 + +struct pl_dav1dalloc { + uint32_t magic[2]; + pl_gpu gpu; + pl_buf buf; +}; + +struct pl_dav1dref { + Dav1dPicture pic; + uint8_t count; +}; + +static void pl_dav1dpicture_unref(void *priv) +{ + struct pl_dav1dref *ref = priv; + if (--ref->count == 0) { + dav1d_picture_unref(&ref->pic); + free(ref); + } +} + +PL_DAV1D_API bool pl_upload_dav1dpicture(pl_gpu gpu, + struct pl_frame *out, + pl_tex tex[3], + const struct pl_dav1d_upload_params *params) +{ + Dav1dPicture *pic = params->picture; + pl_frame_from_dav1dpicture(out, pic); + if (!params->film_grain) + out->film_grain.type = PL_FILM_GRAIN_NONE; + + const int bytes = (pic->p.bpc + 7) / 8; // rounded up + int sub_x = 0, sub_y = 0; + switch (pic->p.layout) { + case DAV1D_PIXEL_LAYOUT_I400: + case DAV1D_PIXEL_LAYOUT_I444: + break; + case DAV1D_PIXEL_LAYOUT_I420: + sub_x = sub_y = 1; + break; + case DAV1D_PIXEL_LAYOUT_I422: + sub_x = 1; + break; + } + + struct pl_plane_data data[3] = { + { + // Y plane + .type = PL_FMT_UNORM, + .width = pic->p.w, + .height = pic->p.h, + .pixel_stride = bytes, + .component_size = {bytes * 8}, + .component_map = {0}, + }, { + // U plane + .type = PL_FMT_UNORM, + .width = pic->p.w >> sub_x, + .height = pic->p.h >> sub_y, + .pixel_stride = bytes, + .component_size = {bytes * 8}, + .component_map = {1}, + }, { + // V plane + .type = PL_FMT_UNORM, + .width = pic->p.w >> sub_x, + .height = pic->p.h >> sub_y, + .pixel_stride = bytes, + .component_size = {bytes * 8}, + .component_map = {2}, + }, + }; + + pl_buf buf = NULL; + struct pl_dav1dalloc *alloc = params->gpu_allocated ? pic->allocator_data : NULL; + struct pl_dav1dref *ref = NULL; + + if (alloc && alloc->magic[0] == PL_MAGIC0 && alloc->magic[1] == PL_MAGIC1) { + // Re-use pre-allocated buffers directly + assert(alloc->gpu == gpu); + buf = alloc->buf; + } else if (params->asynchronous && gpu->limits.callbacks) { + ref = malloc(sizeof(*ref)); + if (!ref) + return false; + memcpy(&ref->pic, pic, sizeof(Dav1dPicture)); + ref->count = out->num_planes; + } + + for (int p = 0; p < out->num_planes; p++) { + ptrdiff_t stride = p > 0 ? 
pic->stride[1] : pic->stride[0]; + if (stride < 0) { + data[p].pixels = (uint8_t *) pic->data[p] + stride * (data[p].height - 1); + data[p].row_stride = -stride; + out->planes[p].flipped = true; + } else { + data[p].pixels = pic->data[p]; + data[p].row_stride = stride; + } + + if (buf) { + data[p].buf = buf; + data[p].buf_offset = (uintptr_t) data[p].pixels - (uintptr_t) buf->data; + data[p].pixels = NULL; + } else if (ref) { + data[p].priv = ref; + data[p].callback = pl_dav1dpicture_unref; + } + + if (!pl_upload_plane(gpu, &out->planes[p], &tex[p], &data[p])) { + free(ref); + return false; + } + } + + if (params->asynchronous) { + if (ref) { + *pic = (Dav1dPicture) {0}; + } else { + dav1d_picture_unref(pic); + } + } + + return true; +} + +PL_DAV1D_API int pl_allocate_dav1dpicture(Dav1dPicture *p, void *cookie) +{ + pl_gpu gpu = cookie; + if (!gpu->limits.max_mapped_size || !gpu->limits.host_cached || + !gpu->limits.buf_transfer) + { + return DAV1D_ERR(ENOTSUP); + } + + // Copied from dav1d_default_picture_alloc + const int hbd = p->p.bpc > 8; + const int aligned_w = PL_ALIGN2(p->p.w, 128); + const int aligned_h = PL_ALIGN2(p->p.h, 128); + const int has_chroma = p->p.layout != DAV1D_PIXEL_LAYOUT_I400; + const int ss_ver = p->p.layout == DAV1D_PIXEL_LAYOUT_I420; + const int ss_hor = p->p.layout != DAV1D_PIXEL_LAYOUT_I444; + p->stride[0] = aligned_w << hbd; + p->stride[1] = has_chroma ? (aligned_w >> ss_hor) << hbd : 0; + + // Align strides up to multiples of the GPU performance hints + p->stride[0] = PL_ALIGN2(p->stride[0], gpu->limits.align_tex_xfer_pitch); + p->stride[1] = PL_ALIGN2(p->stride[1], gpu->limits.align_tex_xfer_pitch); + + // Aligning offsets to 4 also implicitly aligns to the texel alignment (1 or 2) + size_t off_align = PL_ALIGN2(gpu->limits.align_tex_xfer_offset, 4); + const size_t y_sz = PL_ALIGN2(p->stride[0] * aligned_h, off_align); + const size_t uv_sz = PL_ALIGN2(p->stride[1] * (aligned_h >> ss_ver), off_align); + + // The extra DAV1D_PICTURE_ALIGNMENTs are to brute force plane alignment, + // even in the case that the driver gives us insane alignments + const size_t pic_size = y_sz + 2 * uv_sz; + const size_t total_size = pic_size + DAV1D_PICTURE_ALIGNMENT * 4; + + // Validate size limitations + if (total_size > gpu->limits.max_mapped_size) + return DAV1D_ERR(ENOMEM); + + pl_buf buf = pl_buf_create(gpu, pl_buf_params( + .size = total_size, + .host_mapped = true, + .memory_type = PL_BUF_MEM_HOST, + )); + + if (!buf) + return DAV1D_ERR(ENOMEM); + + struct pl_dav1dalloc *alloc = malloc(sizeof(struct pl_dav1dalloc)); + if (!alloc) { + pl_buf_destroy(gpu, &buf); + return DAV1D_ERR(ENOMEM); + } + + *alloc = (struct pl_dav1dalloc) { + .magic = { PL_MAGIC0, PL_MAGIC1 }, + .gpu = gpu, + .buf = buf, + }; + + assert(buf->data); + uintptr_t base = (uintptr_t) buf->data, data[3]; + data[0] = PL_ALIGN2(base, DAV1D_PICTURE_ALIGNMENT); + data[1] = PL_ALIGN2(data[0] + y_sz, DAV1D_PICTURE_ALIGNMENT); + data[2] = PL_ALIGN2(data[1] + uv_sz, DAV1D_PICTURE_ALIGNMENT); + + p->allocator_data = alloc; + p->data[0] = (void *) data[0]; + p->data[1] = (void *) data[1]; + p->data[2] = (void *) data[2]; + return 0; +} + +PL_DAV1D_API void pl_release_dav1dpicture(Dav1dPicture *p, void *cookie) +{ + struct pl_dav1dalloc *alloc = p->allocator_data; + if (!alloc) + return; + + assert(alloc->magic[0] == PL_MAGIC0); + assert(alloc->magic[1] == PL_MAGIC1); + assert(alloc->gpu == cookie); + pl_buf_destroy(alloc->gpu, &alloc->buf); + free(alloc); + + p->data[0] = p->data[1] = p->data[2] = p->allocator_data 
= NULL; +} + +#undef PL_ALIGN2 +#undef PL_MAGIC0 +#undef PL_MAGIC1 + +#endif // LIBPLACEBO_DAV1D_H_ diff --git a/src/include/libplacebo/utils/dolbyvision.h b/src/include/libplacebo/utils/dolbyvision.h new file mode 100644 index 0000000..6d4d72e --- /dev/null +++ b/src/include/libplacebo/utils/dolbyvision.h @@ -0,0 +1,34 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_DOLBYVISION_H_ +#define LIBPLACEBO_DOLBYVISION_H_ + +#include <libplacebo/colorspace.h> + +PL_API_BEGIN + +// Parses the Dolby Vision RPU, and sets the `pl_hdr_metadata` dynamic +// brightness metadata fields accordingly. +// +// Note: requires `PL_HAVE_LIBDOVI` to be defined, no-op otherwise. +PL_API void pl_hdr_metadata_from_dovi_rpu(struct pl_hdr_metadata *out, + const uint8_t *buf, size_t size); + +PL_API_END + +#endif // LIBPLACEBO_DOLBYVISION_H_ diff --git a/src/include/libplacebo/utils/frame_queue.h b/src/include/libplacebo/utils/frame_queue.h new file mode 100644 index 0000000..2a9c90c --- /dev/null +++ b/src/include/libplacebo/utils/frame_queue.h @@ -0,0 +1,230 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_FRAME_QUEUE_H +#define LIBPLACEBO_FRAME_QUEUE_H + +#include <libplacebo/renderer.h> +#include <libplacebo/shaders/deinterlacing.h> + +PL_API_BEGIN + +// An abstraction layer for automatically turning a conceptual stream of +// (frame, pts) pairs, as emitted by a decoder or filter graph, into a +// `pl_frame_mix` suitable for `pl_render_image_mix`. +// +// This API ensures that minimal work is performed (e.g. only mapping frames +// that are actually required), while also satisfying the requirements +// of any configured frame mixer. +// +// Thread-safety: Safe +typedef struct pl_queue_t *pl_queue; + +enum pl_queue_status { + PL_QUEUE_OK, // success + PL_QUEUE_EOF, // no more frames are available + PL_QUEUE_MORE, // more frames needed, but not (yet) available + PL_QUEUE_ERR = -1, // some unknown error occurred while retrieving frames +}; + +struct pl_source_frame { + // The frame's presentation timestamp, in seconds relative to the first + // frame. These must be monotonically increasing for subsequent frames. 
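+ // For example, a constant 25 fps stream would advance `pts` in steps of
+ // 1/25 = 0.04 seconds: 0.0, 0.04, 0.08, and so on.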
+ // To implement a discontinuous jump, users must explicitly reset the + // frame queue with `pl_queue_reset` and restart from PTS 0.0. + double pts; + + // The frame's duration. This is not needed in normal scenarios, as the + // FPS can be inferred from the `pts` values themselves. Providing it + // only helps initialize the value for initial frames, which can smooth + // out the interpolation weights. Its use is also highly recommended + // when displaying interlaced frames. (Optional) + float duration; + + // If set to something other than PL_FIELD_NONE, this source frame is + // marked as interlaced. It will be split up into two separate frames + // internally, and exported to the resulting `pl_frame_mix` as a pair of + // fields, referencing the corresponding previous and next frames. The + // first field will have the same PTS as `pts`, and the second field will + // be inserted at the timestamp `pts + duration/2`. + // + // Note: As a result of FPS estimates being unreliable around streams with + // mixed FPS (or when mixing interlaced and progressive frames), it's + // highly recommended to always specify a valid `duration` for interlaced + // frames. + enum pl_field first_field; + + // Abstract frame data itself. To allow mapping frames only when they're + // actually needed, frames use a lazy representation. The provided + // callbacks will be invoked to interface with it. + void *frame_data; + + // This will be called to map the frame to the GPU, only if needed. + // + // `tex` is a pointer to an array of 4 texture objects (or NULL), which + // *may* serve as backing storage for the texture being mapped. These are + // intended to be recreated by `map`, e.g. using `pl_tex_recreate` or + // `pl_upload_plane` as appropriate. They will be managed internally by + // `pl_queue` and destroyed at some unspecified future point in time. + // + // Note: If `map` fails, it will not be retried, nor will `discard` be run. + // The user should clean up state in this case. + bool (*map)(pl_gpu gpu, pl_tex *tex, const struct pl_source_frame *src, + struct pl_frame *out_frame); + + // If present, this will be called on frames that are done being used by + // `pl_queue`. This may be useful to e.g. unmap textures backed by external + // APIs such as hardware decoders. (Optional) + void (*unmap)(pl_gpu gpu, struct pl_frame *frame, const struct pl_source_frame *src); + + // This function will be called for frames that are deemed unnecessary + // (e.g. never became visible) and should instead be cleanly freed. + // (Optional) + void (*discard)(const struct pl_source_frame *src); +}; + +// Create a new, empty frame queue. +// +// It's highly recommended to fully render a single frame with `pts == 0.0`, +// and flush the GPU pipeline with `pl_gpu_finish`, prior to starting the timed +// playback loop. +PL_API pl_queue pl_queue_create(pl_gpu gpu); +PL_API void pl_queue_destroy(pl_queue *queue); + +// Explicitly clear the queue. This is essentially equivalent to destroying +// and recreating the queue, but preserves any internal memory allocations. +// +// Note: Calling `pl_queue_reset` may block, if another thread is currently +// blocked on a different `pl_queue_*` call. +PL_API void pl_queue_reset(pl_queue queue); + +// Explicitly push a frame. This is an alternative way to feed the frame queue +// with incoming frames, the other method being the asynchronous callback +// specified as `pl_queue_params.get_frame`. 
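+//
+// As a purely illustrative sketch (the names `queue`, `frame_pts`, `decoded`
+// and `my_map_frame` are hypothetical placeholders, not part of this API),
+// manual pushing might look like:
+//
+//   struct pl_source_frame src = {
+//       .pts        = frame_pts,
+//       .frame_data = decoded,
+//       .map        = my_map_frame,
+//   };
+//   pl_queue_push(queue, &src);
+//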
Both methods may be used +// simultaneously, although providing `get_frame` is recommended since it +// avoids the risk of the queue underrunning. +// +// When no more frames are available, call this function with `frame == NULL` +// to indicate EOF and begin draining the frame queue. +PL_API void pl_queue_push(pl_queue queue, const struct pl_source_frame *frame); + +// Variant of `pl_queue_push` that blocks while the queue is judged +// (internally) to be "too full". This is useful for asynchronous decoder loops +// in order to prevent the queue from exhausting available RAM if frames are +// decoded significantly faster than they're displayed. +// +// The given `timeout` parameter specifies how long to wait before giving up, +// in nanoseconds. Returns false if this timeout was reached. +PL_API bool pl_queue_push_block(pl_queue queue, uint64_t timeout, + const struct pl_source_frame *frame); + +struct pl_queue_params { + // The PTS of the frame that will be rendered. This should be set to the + // timestamp (in seconds) of the next vsync, relative to the initial frame. + // + // These must be monotonically increasing. To implement a discontinuous + // jump, users must explicitly reset the frame queue with `pl_queue_reset` + // and restart from PTS 0.0. + double pts; + + // The radius of the configured mixer. This should be set to the value + // as returned by `pl_frame_mix_radius`. + float radius; + + // The estimated duration of a vsync, in seconds. This will only be used as + // a hint, the true value will be estimated by comparing `pts` timestamps + // between calls to `pl_queue_update`. (Optional) + float vsync_duration; + + // If the difference between the (estimated) vsync duration and the + // (measured) frame duration is smaller than this threshold, silently + // disable interpolation and switch to ZOH semantics instead. + // + // For example, a value of 0.01 allows the FPS to differ by up to 1% + // without being interpolated. Note that this will result in a continuous + // phase drift unless also compensated for by the user, which will + // eventually resulted in a dropped or duplicated frame. (Though this can + // be preferable to seeing that same phase drift result in a temporally + // smeared image) + float interpolation_threshold; + + // Specifies how long `pl_queue_update` will wait for frames to become + // available, in nanoseconds, before giving up and returning with + // QUEUE_MORE. + // + // If `get_frame` is provided, this value is ignored by `pl_queue` and + // should instead be interpreted by the provided callback. + uint64_t timeout; + + // This callback will be used to pull new frames from the decoder. It may + // block if needed. The user is responsible for setting appropriate time + // limits and/or returning and interpreting QUEUE_MORE as sensible. + // + // Providing this callback is entirely optional. Users can instead choose + // to manually feed the frame queue with new frames using `pl_queue_push`. + enum pl_queue_status (*get_frame)(struct pl_source_frame *out_frame, + const struct pl_queue_params *params); + void *priv; +}; + +#define pl_queue_params(...) (&(struct pl_queue_params) { __VA_ARGS__ }) + +// Advance the frame queue's internal state to the target timestamp. Any frames +// which are no longer needed (i.e. too far in the past) are automatically +// unmapped and evicted. Any future frames which are needed to fill the queue +// must either have been pushed in advance, or will be requested using the +// provided `get_frame` callback. 
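+//
+// As a rough, illustrative sketch only (`queue`, `vsync_pts`, `render_params`
+// and `render_mix` are hypothetical placeholders), a per-vsync update might
+// look like:
+//
+//   struct pl_frame_mix mix;
+//   switch (pl_queue_update(queue, &mix, pl_queue_params(
+//       .pts    = vsync_pts,
+//       .radius = pl_frame_mix_radius(&render_params),
+//   ))) {
+//   case PL_QUEUE_OK:   render_mix(&mix); break;
+//   case PL_QUEUE_MORE: /* push more frames, then retry */ break;
+//   case PL_QUEUE_EOF:  /* end of stream reached */ break;
+//   case PL_QUEUE_ERR:  /* give up and report the error */ break;
+//   }
+//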
If you call this on `out_mix == NULL`, the +// queue state will advance, but no frames will be mapped. +// +// This function may return with PL_QUEUE_MORE, in which case the user may wish +// to ensure more frames are available and then re-run this function with the +// same parameters. In this case, `out_mix` is still written to, but it may be +// incomplete (or even contain no frames at all). Additionally, when the source +// contains interlaced frames (see `pl_source_frame.first_field`), this +// function may return with PL_QUEUE_MORE if a frame is missing references to +// a future frame. +// +// The resulting mix of frames in `out_mix` will represent the neighbourhood of +// the target timestamp, and can be passed to `pl_render_image_mix` as-is. +// +// Note: `out_mix` will only remain valid until the next call to +// `pl_queue_update` or `pl_queue_reset`. +PL_API enum pl_queue_status pl_queue_update(pl_queue queue, struct pl_frame_mix *out_mix, + const struct pl_queue_params *params); + +// Returns a pl_queue's internal estimates for FPS and VPS (vsyncs per second). +// Returns 0.0 if no estimate is available. +PL_API float pl_queue_estimate_fps(pl_queue queue); +PL_API float pl_queue_estimate_vps(pl_queue queue); + +// Returns the number of frames currently contained in a pl_queue. +PL_API int pl_queue_num_frames(pl_queue queue); + +// Inspect the contents of the Nth queued frame. Returns false if `idx` is +// out of range. +// +// Warning: No guarantee is made to ensure validity of `out->frame_data` +// after this call. In particular, pl_queue_* calls made from another thread +// may call `discard()` on the frame in question. The user bears responsibility +// to avoid accessing `out->frame_data` in a multi-threaded scenario unless +// an external guarantee can be made that the frame won't be dequeued until +// it is done being used by the user. +PL_API bool pl_queue_peek(pl_queue queue, int idx, struct pl_source_frame *out); + +PL_API_END + +#endif // LIBPLACEBO_FRAME_QUEUE_H diff --git a/src/include/libplacebo/utils/libav.h b/src/include/libplacebo/utils/libav.h new file mode 100644 index 0000000..91f3dd8 --- /dev/null +++ b/src/include/libplacebo/utils/libav.h @@ -0,0 +1,284 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_LIBAV_H_ +#define LIBPLACEBO_LIBAV_H_ + +#include <libplacebo/config.h> +#include <libplacebo/gpu.h> +#include <libplacebo/shaders/deinterlacing.h> +#include <libplacebo/utils/upload.h> + +#if defined(__cplusplus) && !defined(PL_LIBAV_IMPLEMENTATION) +# define PL_LIBAV_API +# define PL_LIBAV_IMPLEMENTATION 0 +# warning Remember to include this file with a PL_LIBAV_IMPLEMENTATION set to 1 in \ + C translation unit to provide implementation. Suppress this warning by \ + defining PL_LIBAV_IMPLEMENTATION to 0 in C++ files. 
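+// In other words, the intended usage pattern described by the warning above
+// is roughly:
+//
+//   /* in exactly one C translation unit: */
+//   #define PL_LIBAV_IMPLEMENTATION 1
+//   #include <libplacebo/utils/libav.h>
+//
+//   /* in C++ translation units, to suppress this warning: */
+//   #define PL_LIBAV_IMPLEMENTATION 0
+//   #include <libplacebo/utils/libav.h>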
+#elif !defined(PL_LIBAV_IMPLEMENTATION) +# define PL_LIBAV_API static inline +# define PL_LIBAV_IMPLEMENTATION 1 +#else +# define PL_LIBAV_API +#endif + +PL_API_BEGIN + +#include <libavformat/avformat.h> +#include <libavutil/frame.h> +#include <libavutil/version.h> +#include <libavcodec/avcodec.h> + +#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 16, 100) && defined(PL_HAVE_DOVI) +# define PL_HAVE_LAV_DOLBY_VISION +# include <libavutil/dovi_meta.h> +#endif + +#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(56, 61, 100) +# define PL_HAVE_LAV_FILM_GRAIN +# include <libavutil/film_grain_params.h> +#endif + +#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(56, 25, 100) +# define PL_HAVE_LAV_HDR +# include <libavutil/hdr_dynamic_metadata.h> +# include <libavutil/mastering_display_metadata.h> +#endif + +//------------------------------------------------------------------------ +// Important note: For support for AVVkFrame, which depends on <vulkan.h>, +// users *SHOULD* include <vulkan/vulkan.h> manually before this header. +//------------------------------------------------------------------------ + + +// Fill in the details of a `pl_frame` from an AVFrame. This function will +// explicitly clear `out_frame`, setting all extra fields to 0. After this +// function returns, the only missing data is information related to the plane +// texture itself (`planes[N].texture`), as well as any overlays (e.g. +// subtitles). +// +// Note: If the AVFrame contains an embedded ICC profile or H.274 film grain +// metadata, the resulting `out_image->profile` will reference this pointer, +// meaning that in general, the `pl_frame` is only guaranteed to be valid as +// long as the AVFrame is not freed. +// +// Note: This will ignore Dolby Vision metadata by default (to avoid leaking +// memory), either switch to pl_map_avframe_ex or do it manually using +// pl_map_dovi_metadata. +PL_LIBAV_API void pl_frame_from_avframe(struct pl_frame *out_frame, const AVFrame *frame); + +// Deprecated aliases for backwards compatibility +#define pl_image_from_avframe pl_frame_from_avframe +#define pl_target_from_avframe pl_frame_from_avframe + +// Copy extra metadata from an AVStream to a pl_frame. This should be called +// after `pl_frame_from_avframe` or `pl_map_avframe` (respectively), and sets +// metadata associated with stream-level side data. This is needed because +// FFmpeg rather annoyingly does not propagate stream-level metadata to frames. +PL_LIBAV_API void pl_frame_copy_stream_props(struct pl_frame *out_frame, + const AVStream *stream); + +#ifdef PL_HAVE_LAV_HDR +struct pl_av_hdr_metadata { + // All fields are optional and may be passed as `NULL`. + const AVMasteringDisplayMetadata *mdm; + const AVContentLightMetadata *clm; + const AVDynamicHDRPlus *dhp; +}; + +// Helper function to update a `pl_hdr_metadata` struct from HDR10/HDR10+ +// metadata in the FFmpeg format. Unspecified/invalid elements will be left +// uninitialized in `out`. +PL_LIBAV_API void pl_map_hdr_metadata(struct pl_hdr_metadata *out, + const struct pl_av_hdr_metadata *metadata); +#endif + +#ifdef PL_HAVE_LAV_DOLBY_VISION +// Helper function to map Dolby Vision metadata from the FFmpeg format. +PL_LIBAV_API void pl_map_dovi_metadata(struct pl_dovi_metadata *out, + const AVDOVIMetadata *metadata); + +// Helper function to map Dolby Vision metadata from the FFmpeg format +// to `pl_dovi_metadata`, and adds it to the `pl_frame`. +// The `pl_frame` colorspace fields and HDR struct are also updated with +// values from the `AVDOVIMetadata`. 
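+//
+// A minimal usage sketch (illustrative only; `avframe` is an AVFrame, `target`
+// a struct pl_frame, and the statically allocated `dovi_meta` is
+// user-managed):
+//
+//   static struct pl_dovi_metadata dovi_meta;
+//   const AVFrameSideData *sd =
+//       av_frame_get_side_data(avframe, AV_FRAME_DATA_DOVI_METADATA);
+//   if (sd)
+//       pl_frame_map_avdovi_metadata(&target, &dovi_meta,
+//                                    (const AVDOVIMetadata *) sd->data);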
+// +// Note: The `pl_dovi_metadata` must be allocated externally. +// Also, currently the metadata is only used if the `AVDOVIRpuDataHeader` +// `disable_residual_flag` field is not zero and can be checked before allocating. +PL_LIBAV_API void pl_frame_map_avdovi_metadata(struct pl_frame *out_frame, + struct pl_dovi_metadata *dovi, + const AVDOVIMetadata *metadata); +#endif + +// Helper function to test if a pixfmt would be supported by the GPU. +// Essentially, this can be used to check if `pl_map_avframe` would work for a +// given AVPixelFormat, without actually uploading or allocating anything. +PL_LIBAV_API bool pl_test_pixfmt(pl_gpu gpu, enum AVPixelFormat pixfmt); + +// Variant of `pl_test_pixfmt` that also tests for the given capabilities +// being present. Note that in the presence of hardware accelerated frames, +// this cannot be tested without frame-specific information (i.e. swformat), +// but in practice this should be a non-issue as GPU-native hwformats will +// probably be fully supported. +PL_LIBAV_API bool pl_test_pixfmt_caps(pl_gpu gpu, enum AVPixelFormat pixfmt, + enum pl_fmt_caps caps); + +// Like `pl_frame_from_avframe`, but the texture pointers are also initialized +// to ensure they have the correct size and format to match the AVframe. +// Similar in spirit to `pl_recreate_plane`, and the same notes apply. `tex` +// must be an array of 4 pointers of type `pl_tex`, each either +// pointing to a valid texture, or NULL. Returns whether successful. +PL_LIBAV_API bool pl_frame_recreate_from_avframe(pl_gpu gpu, struct pl_frame *out_frame, + pl_tex tex[4], const AVFrame *frame); + +struct pl_avframe_params { + // The AVFrame to map. Required. + const AVFrame *frame; + + // Backing textures for frame data. Required for all non-hwdec formats. + // This must point to an array of four valid textures (or NULL entries). + // + // Note: Not cleaned up by `pl_unmap_avframe`. The intent is for users to + // re-use this texture array for subsequent frames, to avoid texture + // creation/destruction overhead. + pl_tex *tex; + + // Also map Dolby Vision metadata (if supported). Note that this also + // overrides the colorimetry metadata (forces BT.2020+PQ). + bool map_dovi; +}; + +#define PL_AVFRAME_DEFAULTS \ + .map_dovi = true, + +#define pl_avframe_params(...) (&(struct pl_avframe_params) { PL_AVFRAME_DEFAULTS __VA_ARGS__ }) + +// Very high level helper function to take an `AVFrame` and map it to the GPU. +// The resulting `pl_frame` remains valid until `pl_unmap_avframe` is called, +// which must be called at some point to clean up state. The `AVFrame` is +// automatically ref'd and unref'd if needed. Returns whether successful. +// +// Note: `out_frame->user_data` points to a privately managed opaque struct +// and must not be touched by the user. +PL_LIBAV_API bool pl_map_avframe_ex(pl_gpu gpu, struct pl_frame *out_frame, + const struct pl_avframe_params *params); +PL_LIBAV_API void pl_unmap_avframe(pl_gpu gpu, struct pl_frame *frame); + +// Backwards compatibility with previous versions of this API. +PL_LIBAV_API bool pl_map_avframe(pl_gpu gpu, struct pl_frame *out_frame, + pl_tex tex[4], const AVFrame *avframe); + +// Return the AVFrame* that a pl_frame was mapped from (via pl_map_avframe_ex) +// Note: This reference is attached to the `pl_frame` and will get freed by +// pl_unmap_avframe. +PL_LIBAV_API AVFrame *pl_get_mapped_avframe(const struct pl_frame *frame); + +// Download the texture contents of a `pl_frame` back to a corresponding +// AVFrame. 
Blocks until completion. +// +// Note: This function performs minimal verification, so incorrect usage will +// likely result in broken frames. Use `pl_frame_recreate_from_avframe` to +// ensure matching formats. +PL_LIBAV_API bool pl_download_avframe(pl_gpu gpu, + const struct pl_frame *frame, + AVFrame *out_frame); + +// Helper functions to update the colorimetry data in an AVFrame based on +// the values specified in the given color space / color repr / profile. +// +// Note: These functions can and will allocate AVFrame side data if needed, +// in particular to encode HDR metadata in `space.hdr`. +PL_LIBAV_API void pl_avframe_set_color(AVFrame *frame, struct pl_color_space space); +PL_LIBAV_API void pl_avframe_set_repr(AVFrame *frame, struct pl_color_repr repr); +PL_LIBAV_API void pl_avframe_set_profile(AVFrame *frame, struct pl_icc_profile profile); + +// Map an AVPixelFormat to an array of pl_plane_data structs. The array must +// have at least `av_pix_fmt_count_planes(fmt)` elements, but never more than +// 4. This function leaves `width`, `height` and `row_stride`, as well as the +// data pointers, uninitialized. +// +// If `bits` is non-NULL, this function will attempt aligning the resulting +// `pl_plane_data` struct for optimal compatibility, placing the resulting +// `pl_bit_depth` metadata into `bits`. +// +// Returns the number of plane structs written to, or 0 on error. +// +// Note: This function is usually clumsier to use than the higher-level +// functions above, but it might have some fringe use cases, for example if +// the user wants to replace the data buffers by `pl_buf` references in the +// `pl_plane_data` before uploading it to the GPU. +PL_LIBAV_API int pl_plane_data_from_pixfmt(struct pl_plane_data data[4], + struct pl_bit_encoding *bits, + enum AVPixelFormat pix_fmt); + +// Callback for AVCodecContext.get_buffer2 that allocates memory from +// persistently mapped buffers. This can be more efficient than regular +// system memory, especially on platforms that don't support importing +// PL_HANDLE_HOST_PTR as buffers. +// +// Note: `avctx->opaque` must be a pointer that *points* to the GPU instance. +// That is, it should have type `pl_gpu *`. +PL_LIBAV_API int pl_get_buffer2(AVCodecContext *avctx, AVFrame *pic, int flags); + +// Mapping functions for the various libavutil enums. Note that these are not +// quite 1:1, and even for values that exist in both, the semantics sometimes +// differ. Some special cases (e.g. ICtCp, or XYZ) are handled differently in +// libplacebo and libavutil, respectively. +// +// Because of this, it's generally recommended to avoid these and instead use +// helpers like `pl_frame_from_avframe`, which contain extra logic to patch +// through all of the special cases. 
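+//
+// For the fringe cases where the raw mappings are still wanted, usage is
+// direct; an illustrative example (assuming `frame` is an AVFrame) would be:
+//
+//   frame->colorspace      = pl_system_to_av(PL_COLOR_SYSTEM_BT_709);
+//   frame->color_primaries = pl_primaries_to_av(PL_COLOR_PRIM_BT_709);
+//   frame->color_trc       = pl_transfer_to_av(PL_COLOR_TRC_SRGB);
+//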
+PL_LIBAV_API enum pl_color_system pl_system_from_av(enum AVColorSpace spc); +PL_LIBAV_API enum AVColorSpace pl_system_to_av(enum pl_color_system sys); +PL_LIBAV_API enum pl_color_levels pl_levels_from_av(enum AVColorRange range); +PL_LIBAV_API enum AVColorRange pl_levels_to_av(enum pl_color_levels levels); +PL_LIBAV_API enum pl_color_primaries pl_primaries_from_av(enum AVColorPrimaries prim); +PL_LIBAV_API enum AVColorPrimaries pl_primaries_to_av(enum pl_color_primaries prim); +PL_LIBAV_API enum pl_color_transfer pl_transfer_from_av(enum AVColorTransferCharacteristic trc); +PL_LIBAV_API enum AVColorTransferCharacteristic pl_transfer_to_av(enum pl_color_transfer trc); +PL_LIBAV_API enum pl_chroma_location pl_chroma_from_av(enum AVChromaLocation loc); +PL_LIBAV_API enum AVChromaLocation pl_chroma_to_av(enum pl_chroma_location loc); + +// Helper function to generate a `pl_color_space` struct from an AVFrame. +PL_LIBAV_API void pl_color_space_from_avframe(struct pl_color_space *out_csp, + const AVFrame *frame); + +// Helper function to pick the right `pl_field` value for an AVFrame. +PL_LIBAV_API enum pl_field pl_field_from_avframe(const AVFrame *frame); + +#ifdef PL_HAVE_LAV_FILM_GRAIN +// Fill in film grain parameters from an AVFilmGrainParams. +// +// Note: The resulting struct will only remain valid as long as the +// `AVFilmGrainParams` remains valid. +PL_LIBAV_API void pl_film_grain_from_av(struct pl_film_grain_data *out_data, + const AVFilmGrainParams *fgp); +#endif + +// Deprecated alias for backwards compatibility +#define pl_swapchain_colors_from_avframe pl_color_space_from_avframe + +// Actual implementation, included as part of this header to avoid having +// a compile-time dependency on libavutil. +#if PL_LIBAV_IMPLEMENTATION +# include <libplacebo/utils/libav_internal.h> +#endif + +PL_API_END + +#endif // LIBPLACEBO_LIBAV_H_ diff --git a/src/include/libplacebo/utils/libav_internal.h b/src/include/libplacebo/utils/libav_internal.h new file mode 100644 index 0000000..4c269e5 --- /dev/null +++ b/src/include/libplacebo/utils/libav_internal.h @@ -0,0 +1,1482 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#ifndef LIBPLACEBO_LIBAV_H_ +#error This header should be included as part of <libplacebo/utils/libav.h> +#elif defined(__cplusplus) +#error This header cannot be included from C++ define PL_LIBAV_IMPLEMENTATION appropriately +#else + +#include <assert.h> + +#include <libplacebo/utils/dolbyvision.h> + +#include <libavutil/hwcontext.h> +#include <libavutil/hwcontext_drm.h> +#include <libavutil/imgutils.h> +#include <libavutil/pixdesc.h> +#include <libavutil/display.h> +#include <libavcodec/version.h> + +// Try importing <vulkan.h> dynamically if it wasn't already +#if !defined(VK_API_VERSION_1_2) && defined(__has_include) +# if __has_include(<vulkan/vulkan.h>) +# include <vulkan/vulkan.h> +# endif +#endif + +#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 8, 100) && \ + defined(PL_HAVE_VULKAN) && defined(VK_API_VERSION_1_2) +# define PL_HAVE_LAV_VULKAN +# include <libavutil/hwcontext_vulkan.h> +# include <libplacebo/vulkan.h> +# if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(58, 11, 100) +# define PL_HAVE_LAV_VULKAN_V2 +# endif +#endif + +PL_LIBAV_API enum pl_color_system pl_system_from_av(enum AVColorSpace spc) +{ + switch (spc) { + case AVCOL_SPC_RGB: return PL_COLOR_SYSTEM_RGB; + case AVCOL_SPC_BT709: return PL_COLOR_SYSTEM_BT_709; + case AVCOL_SPC_UNSPECIFIED: return PL_COLOR_SYSTEM_UNKNOWN; + case AVCOL_SPC_RESERVED: return PL_COLOR_SYSTEM_UNKNOWN; + case AVCOL_SPC_FCC: return PL_COLOR_SYSTEM_UNKNOWN; // missing + case AVCOL_SPC_BT470BG: return PL_COLOR_SYSTEM_BT_601; + case AVCOL_SPC_SMPTE170M: return PL_COLOR_SYSTEM_BT_601; + case AVCOL_SPC_SMPTE240M: return PL_COLOR_SYSTEM_SMPTE_240M; + case AVCOL_SPC_YCGCO: return PL_COLOR_SYSTEM_YCGCO; + case AVCOL_SPC_BT2020_NCL: return PL_COLOR_SYSTEM_BT_2020_NC; + case AVCOL_SPC_BT2020_CL: return PL_COLOR_SYSTEM_BT_2020_C; + case AVCOL_SPC_SMPTE2085: return PL_COLOR_SYSTEM_UNKNOWN; // missing + case AVCOL_SPC_CHROMA_DERIVED_NCL: return PL_COLOR_SYSTEM_UNKNOWN; // missing + case AVCOL_SPC_CHROMA_DERIVED_CL: return PL_COLOR_SYSTEM_UNKNOWN; // missing + // Note: this colorspace is confused between PQ and HLG, which libav* + // requires inferring from other sources, but libplacebo makes explicit. + // Default to PQ as it's the more common scenario. 
+ case AVCOL_SPC_ICTCP: return PL_COLOR_SYSTEM_BT_2100_PQ; + case AVCOL_SPC_NB: return PL_COLOR_SYSTEM_COUNT; + } + + return PL_COLOR_SYSTEM_UNKNOWN; +} + +PL_LIBAV_API enum AVColorSpace pl_system_to_av(enum pl_color_system sys) +{ + switch (sys) { + case PL_COLOR_SYSTEM_UNKNOWN: return AVCOL_SPC_UNSPECIFIED; + case PL_COLOR_SYSTEM_BT_601: return AVCOL_SPC_SMPTE170M; + case PL_COLOR_SYSTEM_BT_709: return AVCOL_SPC_BT709; + case PL_COLOR_SYSTEM_SMPTE_240M: return AVCOL_SPC_SMPTE240M; + case PL_COLOR_SYSTEM_BT_2020_NC: return AVCOL_SPC_BT2020_NCL; + case PL_COLOR_SYSTEM_BT_2020_C: return AVCOL_SPC_BT2020_CL; + case PL_COLOR_SYSTEM_BT_2100_PQ: return AVCOL_SPC_ICTCP; + case PL_COLOR_SYSTEM_BT_2100_HLG: return AVCOL_SPC_ICTCP; + case PL_COLOR_SYSTEM_DOLBYVISION: return AVCOL_SPC_UNSPECIFIED; // missing + case PL_COLOR_SYSTEM_YCGCO: return AVCOL_SPC_YCGCO; + case PL_COLOR_SYSTEM_RGB: return AVCOL_SPC_RGB; + case PL_COLOR_SYSTEM_XYZ: return AVCOL_SPC_UNSPECIFIED; // handled differently + case PL_COLOR_SYSTEM_COUNT: return AVCOL_SPC_NB; + } + + return AVCOL_SPC_UNSPECIFIED; +} + +PL_LIBAV_API enum pl_color_levels pl_levels_from_av(enum AVColorRange range) +{ + switch (range) { + case AVCOL_RANGE_UNSPECIFIED: return PL_COLOR_LEVELS_UNKNOWN; + case AVCOL_RANGE_MPEG: return PL_COLOR_LEVELS_LIMITED; + case AVCOL_RANGE_JPEG: return PL_COLOR_LEVELS_FULL; + case AVCOL_RANGE_NB: return PL_COLOR_LEVELS_COUNT; + } + + return PL_COLOR_LEVELS_UNKNOWN; +} + +PL_LIBAV_API enum AVColorRange pl_levels_to_av(enum pl_color_levels levels) +{ + switch (levels) { + case PL_COLOR_LEVELS_UNKNOWN: return AVCOL_RANGE_UNSPECIFIED; + case PL_COLOR_LEVELS_LIMITED: return AVCOL_RANGE_MPEG; + case PL_COLOR_LEVELS_FULL: return AVCOL_RANGE_JPEG; + case PL_COLOR_LEVELS_COUNT: return AVCOL_RANGE_NB; + } + + return AVCOL_RANGE_UNSPECIFIED; +} + +PL_LIBAV_API enum pl_color_primaries pl_primaries_from_av(enum AVColorPrimaries prim) +{ + switch (prim) { + case AVCOL_PRI_RESERVED0: return PL_COLOR_PRIM_UNKNOWN; + case AVCOL_PRI_BT709: return PL_COLOR_PRIM_BT_709; + case AVCOL_PRI_UNSPECIFIED: return PL_COLOR_PRIM_UNKNOWN; + case AVCOL_PRI_RESERVED: return PL_COLOR_PRIM_UNKNOWN; + case AVCOL_PRI_BT470M: return PL_COLOR_PRIM_BT_470M; + case AVCOL_PRI_BT470BG: return PL_COLOR_PRIM_BT_601_625; + case AVCOL_PRI_SMPTE170M: return PL_COLOR_PRIM_BT_601_525; + case AVCOL_PRI_SMPTE240M: return PL_COLOR_PRIM_BT_601_525; + case AVCOL_PRI_FILM: return PL_COLOR_PRIM_FILM_C; + case AVCOL_PRI_BT2020: return PL_COLOR_PRIM_BT_2020; + case AVCOL_PRI_SMPTE428: return PL_COLOR_PRIM_CIE_1931; + case AVCOL_PRI_SMPTE431: return PL_COLOR_PRIM_DCI_P3; + case AVCOL_PRI_SMPTE432: return PL_COLOR_PRIM_DISPLAY_P3; + case AVCOL_PRI_JEDEC_P22: return PL_COLOR_PRIM_EBU_3213; + case AVCOL_PRI_NB: return PL_COLOR_PRIM_COUNT; + } + + return PL_COLOR_PRIM_UNKNOWN; +} + +PL_LIBAV_API enum AVColorPrimaries pl_primaries_to_av(enum pl_color_primaries prim) +{ + switch (prim) { + case PL_COLOR_PRIM_UNKNOWN: return AVCOL_PRI_UNSPECIFIED; + case PL_COLOR_PRIM_BT_601_525: return AVCOL_PRI_SMPTE170M; + case PL_COLOR_PRIM_BT_601_625: return AVCOL_PRI_BT470BG; + case PL_COLOR_PRIM_BT_709: return AVCOL_PRI_BT709; + case PL_COLOR_PRIM_BT_470M: return AVCOL_PRI_BT470M; + case PL_COLOR_PRIM_EBU_3213: return AVCOL_PRI_JEDEC_P22; + case PL_COLOR_PRIM_BT_2020: return AVCOL_PRI_BT2020; + case PL_COLOR_PRIM_APPLE: return AVCOL_PRI_UNSPECIFIED; // missing + case PL_COLOR_PRIM_ADOBE: return AVCOL_PRI_UNSPECIFIED; // missing + case PL_COLOR_PRIM_PRO_PHOTO: return AVCOL_PRI_UNSPECIFIED; // 
missing + case PL_COLOR_PRIM_CIE_1931: return AVCOL_PRI_SMPTE428; + case PL_COLOR_PRIM_DCI_P3: return AVCOL_PRI_SMPTE431; + case PL_COLOR_PRIM_DISPLAY_P3: return AVCOL_PRI_SMPTE432; + case PL_COLOR_PRIM_V_GAMUT: return AVCOL_PRI_UNSPECIFIED; // missing + case PL_COLOR_PRIM_S_GAMUT: return AVCOL_PRI_UNSPECIFIED; // missing + case PL_COLOR_PRIM_FILM_C: return AVCOL_PRI_FILM; + case PL_COLOR_PRIM_ACES_AP0: return AVCOL_PRI_UNSPECIFIED; // missing + case PL_COLOR_PRIM_ACES_AP1: return AVCOL_PRI_UNSPECIFIED; // missing + case PL_COLOR_PRIM_COUNT: return AVCOL_PRI_NB; + } + + return AVCOL_PRI_UNSPECIFIED; +} + +PL_LIBAV_API enum pl_color_transfer pl_transfer_from_av(enum AVColorTransferCharacteristic trc) +{ + switch (trc) { + case AVCOL_TRC_RESERVED0: return PL_COLOR_TRC_UNKNOWN; + case AVCOL_TRC_BT709: return PL_COLOR_TRC_BT_1886; // EOTF != OETF + case AVCOL_TRC_UNSPECIFIED: return PL_COLOR_TRC_UNKNOWN; + case AVCOL_TRC_RESERVED: return PL_COLOR_TRC_UNKNOWN; + case AVCOL_TRC_GAMMA22: return PL_COLOR_TRC_GAMMA22; + case AVCOL_TRC_GAMMA28: return PL_COLOR_TRC_GAMMA28; + case AVCOL_TRC_SMPTE170M: return PL_COLOR_TRC_BT_1886; // EOTF != OETF + case AVCOL_TRC_SMPTE240M: return PL_COLOR_TRC_BT_1886; // EOTF != OETF + case AVCOL_TRC_LINEAR: return PL_COLOR_TRC_LINEAR; + case AVCOL_TRC_LOG: return PL_COLOR_TRC_UNKNOWN; // missing + case AVCOL_TRC_LOG_SQRT: return PL_COLOR_TRC_UNKNOWN; // missing + case AVCOL_TRC_IEC61966_2_4: return PL_COLOR_TRC_BT_1886; // EOTF != OETF + case AVCOL_TRC_BT1361_ECG: return PL_COLOR_TRC_BT_1886; // ETOF != OETF + case AVCOL_TRC_IEC61966_2_1: return PL_COLOR_TRC_SRGB; + case AVCOL_TRC_BT2020_10: return PL_COLOR_TRC_BT_1886; // EOTF != OETF + case AVCOL_TRC_BT2020_12: return PL_COLOR_TRC_BT_1886; // EOTF != OETF + case AVCOL_TRC_SMPTE2084: return PL_COLOR_TRC_PQ; + case AVCOL_TRC_SMPTE428: return PL_COLOR_TRC_ST428; + case AVCOL_TRC_ARIB_STD_B67: return PL_COLOR_TRC_HLG; + case AVCOL_TRC_NB: return PL_COLOR_TRC_COUNT; + } + + return PL_COLOR_TRC_UNKNOWN; +} + +PL_LIBAV_API enum AVColorTransferCharacteristic pl_transfer_to_av(enum pl_color_transfer trc) +{ + switch (trc) { + case PL_COLOR_TRC_UNKNOWN: return AVCOL_TRC_UNSPECIFIED; + case PL_COLOR_TRC_BT_1886: return AVCOL_TRC_BT709; // EOTF != OETF + case PL_COLOR_TRC_SRGB: return AVCOL_TRC_IEC61966_2_1; + case PL_COLOR_TRC_LINEAR: return AVCOL_TRC_LINEAR; + case PL_COLOR_TRC_GAMMA18: return AVCOL_TRC_UNSPECIFIED; // missing + case PL_COLOR_TRC_GAMMA20: return AVCOL_TRC_UNSPECIFIED; // missing + case PL_COLOR_TRC_GAMMA22: return AVCOL_TRC_GAMMA22; + case PL_COLOR_TRC_GAMMA24: return AVCOL_TRC_UNSPECIFIED; // missing + case PL_COLOR_TRC_GAMMA26: return AVCOL_TRC_UNSPECIFIED; // missing + case PL_COLOR_TRC_GAMMA28: return AVCOL_TRC_GAMMA28; + case PL_COLOR_TRC_ST428: return AVCOL_TRC_SMPTE428; + case PL_COLOR_TRC_PRO_PHOTO: return AVCOL_TRC_UNSPECIFIED; // missing + case PL_COLOR_TRC_PQ: return AVCOL_TRC_SMPTE2084; + case PL_COLOR_TRC_HLG: return AVCOL_TRC_ARIB_STD_B67; + case PL_COLOR_TRC_V_LOG: return AVCOL_TRC_UNSPECIFIED; // missing + case PL_COLOR_TRC_S_LOG1: return AVCOL_TRC_UNSPECIFIED; // missing + case PL_COLOR_TRC_S_LOG2: return AVCOL_TRC_UNSPECIFIED; // missing + case PL_COLOR_TRC_COUNT: return AVCOL_TRC_NB; + } + + return AVCOL_TRC_UNSPECIFIED; +} + +PL_LIBAV_API enum pl_chroma_location pl_chroma_from_av(enum AVChromaLocation loc) +{ + switch (loc) { + case AVCHROMA_LOC_UNSPECIFIED: return PL_CHROMA_UNKNOWN; + case AVCHROMA_LOC_LEFT: return PL_CHROMA_LEFT; + case AVCHROMA_LOC_CENTER: return PL_CHROMA_CENTER; + 
case AVCHROMA_LOC_TOPLEFT: return PL_CHROMA_TOP_LEFT; + case AVCHROMA_LOC_TOP: return PL_CHROMA_TOP_CENTER; + case AVCHROMA_LOC_BOTTOMLEFT: return PL_CHROMA_BOTTOM_LEFT; + case AVCHROMA_LOC_BOTTOM: return PL_CHROMA_BOTTOM_CENTER; + case AVCHROMA_LOC_NB: return PL_CHROMA_COUNT; + } + + return PL_CHROMA_UNKNOWN; +} + +PL_LIBAV_API enum AVChromaLocation pl_chroma_to_av(enum pl_chroma_location loc) +{ + switch (loc) { + case PL_CHROMA_UNKNOWN: return AVCHROMA_LOC_UNSPECIFIED; + case PL_CHROMA_LEFT: return AVCHROMA_LOC_LEFT; + case PL_CHROMA_CENTER: return AVCHROMA_LOC_CENTER; + case PL_CHROMA_TOP_LEFT: return AVCHROMA_LOC_TOPLEFT; + case PL_CHROMA_TOP_CENTER: return AVCHROMA_LOC_TOP; + case PL_CHROMA_BOTTOM_LEFT: return AVCHROMA_LOC_BOTTOMLEFT; + case PL_CHROMA_BOTTOM_CENTER: return AVCHROMA_LOC_BOTTOM; + case PL_CHROMA_COUNT: return AVCHROMA_LOC_NB; + } + + return AVCHROMA_LOC_UNSPECIFIED; +} + +#ifdef PL_HAVE_LAV_HDR +PL_LIBAV_API void pl_map_hdr_metadata(struct pl_hdr_metadata *out, + const struct pl_av_hdr_metadata *data) +{ + if (data->mdm) { + if (data->mdm->has_luminance) { + out->max_luma = av_q2d(data->mdm->max_luminance); + out->min_luma = av_q2d(data->mdm->min_luminance); + if (out->max_luma < 10.0 || out->min_luma >= out->max_luma) + out->max_luma = out->min_luma = 0; /* sanity */ + } + if (data->mdm->has_primaries) { + out->prim = (struct pl_raw_primaries) { + .red.x = av_q2d(data->mdm->display_primaries[0][0]), + .red.y = av_q2d(data->mdm->display_primaries[0][1]), + .green.x = av_q2d(data->mdm->display_primaries[1][0]), + .green.y = av_q2d(data->mdm->display_primaries[1][1]), + .blue.x = av_q2d(data->mdm->display_primaries[2][0]), + .blue.y = av_q2d(data->mdm->display_primaries[2][1]), + .white.x = av_q2d(data->mdm->white_point[0]), + .white.y = av_q2d(data->mdm->white_point[1]), + }; + } + } + + if (data->clm) { + out->max_cll = data->clm->MaxCLL; + out->max_fall = data->clm->MaxFALL; + } + + if (data->dhp && data->dhp->application_version < 2) { + float hist_max = 0; + const AVHDRPlusColorTransformParams *pars = &data->dhp->params[0]; + assert(data->dhp->num_windows > 0); + out->scene_max[0] = 10000 * av_q2d(pars->maxscl[0]); + out->scene_max[1] = 10000 * av_q2d(pars->maxscl[1]); + out->scene_max[2] = 10000 * av_q2d(pars->maxscl[2]); + out->scene_avg = 10000 * av_q2d(pars->average_maxrgb); + + // Calculate largest value from histogram to use as fallback for clips + // with missing MaxSCL information. Note that this may end up picking + // the "reserved" value at the 5% percentile, which in practice appears + // to track the brightest pixel in the scene. 
+ for (int i = 0; i < pars->num_distribution_maxrgb_percentiles; i++) { + float hist_val = av_q2d(pars->distribution_maxrgb[i].percentile); + if (hist_val > hist_max) + hist_max = hist_val; + } + hist_max *= 10000; + if (!out->scene_max[0]) + out->scene_max[0] = hist_max; + if (!out->scene_max[1]) + out->scene_max[1] = hist_max; + if (!out->scene_max[2]) + out->scene_max[2] = hist_max; + + if (pars->tone_mapping_flag == 1) { + out->ootf.target_luma = av_q2d(data->dhp->targeted_system_display_maximum_luminance); + out->ootf.knee_x = av_q2d(pars->knee_point_x); + out->ootf.knee_y = av_q2d(pars->knee_point_y); + assert(pars->num_bezier_curve_anchors < 16); + for (int i = 0; i < pars->num_bezier_curve_anchors; i++) + out->ootf.anchors[i] = av_q2d(pars->bezier_curve_anchors[i]); + out->ootf.num_anchors = pars->num_bezier_curve_anchors; + } + } +} +#endif // PL_HAVE_LAV_HDR + +static inline void *pl_get_side_data_raw(const AVFrame *frame, + enum AVFrameSideDataType type) +{ + const AVFrameSideData *sd = av_frame_get_side_data(frame, type); + return sd ? (void *) sd->data : NULL; +} + +PL_LIBAV_API void pl_color_space_from_avframe(struct pl_color_space *out_csp, + const AVFrame *frame) +{ + *out_csp = (struct pl_color_space) { + .primaries = pl_primaries_from_av(frame->color_primaries), + .transfer = pl_transfer_from_av(frame->color_trc), + }; + +#ifdef PL_HAVE_LAV_HDR + pl_map_hdr_metadata(&out_csp->hdr, &(struct pl_av_hdr_metadata) { + .mdm = pl_get_side_data_raw(frame, AV_FRAME_DATA_MASTERING_DISPLAY_METADATA), + .clm = pl_get_side_data_raw(frame, AV_FRAME_DATA_CONTENT_LIGHT_LEVEL), + .dhp = pl_get_side_data_raw(frame, AV_FRAME_DATA_DYNAMIC_HDR_PLUS), + }); +#endif +} + +PL_LIBAV_API enum pl_field pl_field_from_avframe(const AVFrame *frame) +{ +#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(58, 7, 100) + if (!frame || !(frame->flags & AV_FRAME_FLAG_INTERLACED)) + return PL_FIELD_NONE; + return (frame->flags & AV_FRAME_FLAG_TOP_FIELD_FIRST) + ? PL_FIELD_TOP : PL_FIELD_BOTTOM; +#else + if (!frame || !frame->interlaced_frame) + return PL_FIELD_NONE; + return frame->top_field_first ? 
PL_FIELD_TOP : PL_FIELD_BOTTOM; +#endif +} + +#ifdef PL_HAVE_LAV_FILM_GRAIN +PL_LIBAV_API void pl_film_grain_from_av(struct pl_film_grain_data *out_data, + const AVFilmGrainParams *fgp) +{ + out_data->seed = fgp->seed; + + switch (fgp->type) { + case AV_FILM_GRAIN_PARAMS_NONE: break; + case AV_FILM_GRAIN_PARAMS_AV1: { + const AVFilmGrainAOMParams *src = &fgp->codec.aom; + struct pl_av1_grain_data *dst = &out_data->params.av1; + out_data->type = PL_FILM_GRAIN_AV1; + *dst = (struct pl_av1_grain_data) { + .num_points_y = src->num_y_points, + .chroma_scaling_from_luma = src->chroma_scaling_from_luma, + .num_points_uv = { src->num_uv_points[0], src->num_uv_points[1] }, + .scaling_shift = src->scaling_shift, + .ar_coeff_lag = src->ar_coeff_lag, + .ar_coeff_shift = src->ar_coeff_shift, + .grain_scale_shift = src->grain_scale_shift, + .uv_mult = { src->uv_mult[0], src->uv_mult[1] }, + .uv_mult_luma = { src->uv_mult_luma[0], src->uv_mult_luma[1] }, + .uv_offset = { src->uv_offset[0], src->uv_offset[1] }, + .overlap = src->overlap_flag, + }; + + assert(sizeof(dst->ar_coeffs_uv) == sizeof(src->ar_coeffs_uv)); + memcpy(dst->points_y, src->y_points, sizeof(dst->points_y)); + memcpy(dst->points_uv, src->uv_points, sizeof(dst->points_uv)); + memcpy(dst->ar_coeffs_y, src->ar_coeffs_y, sizeof(dst->ar_coeffs_y)); + memcpy(dst->ar_coeffs_uv, src->ar_coeffs_uv, sizeof(dst->ar_coeffs_uv)); + break; + } +#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 2, 100) + case AV_FILM_GRAIN_PARAMS_H274: { + const AVFilmGrainH274Params *src = &fgp->codec.h274; + struct pl_h274_grain_data *dst = &out_data->params.h274; + out_data->type = PL_FILM_GRAIN_H274; + *dst = (struct pl_h274_grain_data) { + .model_id = src->model_id, + .blending_mode_id = src->blending_mode_id, + .log2_scale_factor = src->log2_scale_factor, + .component_model_present = { + src->component_model_present[0], + src->component_model_present[1], + src->component_model_present[2], + }, + .intensity_interval_lower_bound = { + src->intensity_interval_lower_bound[0], + src->intensity_interval_lower_bound[1], + src->intensity_interval_lower_bound[2], + }, + .intensity_interval_upper_bound = { + src->intensity_interval_upper_bound[0], + src->intensity_interval_upper_bound[1], + src->intensity_interval_upper_bound[2], + }, + .comp_model_value = { + src->comp_model_value[0], + src->comp_model_value[1], + src->comp_model_value[2], + }, + }; + memcpy(dst->num_intensity_intervals, src->num_intensity_intervals, + sizeof(dst->num_intensity_intervals)); + memcpy(dst->num_model_values, src->num_model_values, + sizeof(dst->num_model_values)); + break; + } +#endif + } +} +#endif // PL_HAVE_LAV_FILM_GRAIN + +static inline int pl_plane_data_num_comps(const struct pl_plane_data *data) +{ + for (int i = 0; i < 4; i++) { + if (data->component_size[i] == 0) + return i; + } + + return 4; +} + +PL_LIBAV_API int pl_plane_data_from_pixfmt(struct pl_plane_data out_data[4], + struct pl_bit_encoding *out_bits, + enum AVPixelFormat pix_fmt) +{ + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt); + int planes = av_pix_fmt_count_planes(pix_fmt); + struct pl_plane_data aligned_data[4]; + struct pl_bit_encoding bits; + bool first; + if (!desc || planes < 0) // e.g. 
AV_PIX_FMT_NONE + return 0; + + if (desc->flags & AV_PIX_FMT_FLAG_BITSTREAM) { + // Bitstream formats will most likely never be supported + return 0; + } + + if (desc->flags & AV_PIX_FMT_FLAG_PAL) { + // Palette formats are (currently) not supported + return 0; + } + + if (desc->flags & AV_PIX_FMT_FLAG_BAYER) { + // Bayer format don't have valid `desc->offset` values, so we can't + // use `pl_plane_data_from_mask` on them. + return 0; + } + + if (desc->nb_components == 0 || desc->nb_components > 4) { + // Bogus components, possibly fake/virtual/hwaccel format? + return 0; + } + + if (planes > 4) + return 0; // This shouldn't ever happen + + // Fill in the details for each plane + for (int p = 0; p < planes; p++) { + struct pl_plane_data *data = &out_data[p]; + int size[4] = {0}; + int shift[4] = {0}; + data->swapped = desc->flags & AV_PIX_FMT_FLAG_BE; + data->type = (desc->flags & AV_PIX_FMT_FLAG_FLOAT) + ? PL_FMT_FLOAT + : PL_FMT_UNORM; + + data->pixel_stride = 0; + + for (int c = 0; c < desc->nb_components; c++) { + const AVComponentDescriptor *comp = &desc->comp[c]; + if (comp->plane != p) + continue; + if (data->swapped && comp->shift) { + // We cannot naively handle packed big endian formats because + // swapping the words also swaps the component order, so just + // exit out as a stupid safety measure + return 0; + } + + size[c] = comp->depth; + shift[c] = comp->shift + comp->offset * 8; + + if (data->pixel_stride && (int) data->pixel_stride != comp->step) { + // Pixel format contains components with different pixel stride + // (e.g. packed YUYV), this is currently not supported + return 0; + } + data->pixel_stride = comp->step; + } + + pl_plane_data_from_comps(data, size, shift); + } + + if (!out_bits) + return planes; + + // Attempt aligning all of the planes for optimum compatibility + first = true; + for (int p = 0; p < planes; p++) { + aligned_data[p] = out_data[p]; + + // Planes with only an alpha component should be ignored + if (pl_plane_data_num_comps(&aligned_data[p]) == 1 && + aligned_data[p].component_map[0] == PL_CHANNEL_A) + { + continue; + } + + if (!pl_plane_data_align(&aligned_data[p], &bits)) + goto misaligned; + + if (first) { + *out_bits = bits; + first = false; + } else { + if (!pl_bit_encoding_equal(&bits, out_bits)) + goto misaligned; + } + } + + // Overwrite the planes by their aligned versions + for (int p = 0; p < planes; p++) + out_data[p] = aligned_data[p]; + + return planes; + +misaligned: + *out_bits = (struct pl_bit_encoding) {0}; + return planes; +} + +PL_LIBAV_API bool pl_test_pixfmt_caps(pl_gpu gpu, enum AVPixelFormat pixfmt, + enum pl_fmt_caps caps) +{ + struct pl_bit_encoding bits; + struct pl_plane_data data[4]; + pl_fmt fmt; + int planes; + + switch (pixfmt) { + case AV_PIX_FMT_DRM_PRIME: + case AV_PIX_FMT_VAAPI: + return gpu->import_caps.tex & PL_HANDLE_DMA_BUF; + +#ifdef PL_HAVE_LAV_VULKAN + case AV_PIX_FMT_VULKAN: + return pl_vulkan_get(gpu); +#endif + + default: break; + } + + planes = pl_plane_data_from_pixfmt(data, &bits, pixfmt); + if (!planes) + return false; + + for (int i = 0; i < planes; i++) { + data[i].row_stride = 0; + fmt = pl_plane_find_fmt(gpu, NULL, &data[i]); + if (!fmt || (fmt->caps & caps) != caps) + return false; + + } + + return true; +} + +PL_LIBAV_API bool pl_test_pixfmt(pl_gpu gpu, enum AVPixelFormat pixfmt) +{ + return pl_test_pixfmt_caps(gpu, pixfmt, 0); +} + +PL_LIBAV_API void pl_avframe_set_color(AVFrame *frame, struct pl_color_space csp) +{ + const AVFrameSideData *sd; + (void) sd; + + frame->color_primaries = 
pl_primaries_to_av(csp.primaries); + frame->color_trc = pl_transfer_to_av(csp.transfer); + +#ifdef PL_HAVE_LAV_HDR + if (csp.hdr.max_cll) { + sd = av_frame_get_side_data(frame, AV_FRAME_DATA_CONTENT_LIGHT_LEVEL); + if (!sd) { + sd = av_frame_new_side_data(frame, AV_FRAME_DATA_CONTENT_LIGHT_LEVEL, + sizeof(AVContentLightMetadata)); + } + + if (sd) { + AVContentLightMetadata *clm = (AVContentLightMetadata *) sd->data; + *clm = (AVContentLightMetadata) { + .MaxCLL = csp.hdr.max_cll, + .MaxFALL = csp.hdr.max_fall, + }; + } + } + + if (csp.hdr.max_luma || csp.hdr.prim.red.x) { + sd = av_frame_get_side_data(frame, AV_FRAME_DATA_MASTERING_DISPLAY_METADATA); + if (!sd) { + sd = av_frame_new_side_data(frame, AV_FRAME_DATA_MASTERING_DISPLAY_METADATA, + sizeof(AVMasteringDisplayMetadata)); + } + + if (sd) { + AVMasteringDisplayMetadata *mdm = (AVMasteringDisplayMetadata *) sd->data; + *mdm = (AVMasteringDisplayMetadata) { + .max_luminance = av_d2q(csp.hdr.max_luma, 1000000), + .min_luminance = av_d2q(csp.hdr.min_luma, 1000000), + .has_luminance = !!csp.hdr.max_luma, + .display_primaries = { + { + av_d2q(csp.hdr.prim.red.x, 1000000), + av_d2q(csp.hdr.prim.red.y, 1000000), + }, { + av_d2q(csp.hdr.prim.green.x, 1000000), + av_d2q(csp.hdr.prim.green.y, 1000000), + }, { + av_d2q(csp.hdr.prim.blue.x, 1000000), + av_d2q(csp.hdr.prim.blue.y, 1000000), + } + }, + .white_point = { + av_d2q(csp.hdr.prim.white.x, 1000000), + av_d2q(csp.hdr.prim.white.y, 1000000), + }, + .has_primaries = !!csp.hdr.prim.red.x, + }; + } + } +#endif // PL_HAVE_LAV_HDR +} + +PL_LIBAV_API void pl_avframe_set_repr(AVFrame *frame, struct pl_color_repr repr) +{ + frame->colorspace = pl_system_to_av(repr.sys); + frame->color_range = pl_levels_to_av(repr.levels); + + // No real way to map repr.bits, the image format already has to match +} + +PL_LIBAV_API void pl_avframe_set_profile(AVFrame *frame, struct pl_icc_profile profile) +{ + const AVFrameSideData *sd; + av_frame_remove_side_data(frame, AV_FRAME_DATA_ICC_PROFILE); + + if (!profile.len) + return; + + sd = av_frame_new_side_data(frame, AV_FRAME_DATA_ICC_PROFILE, profile.len); + memcpy(sd->data, profile.data, profile.len); +} + +PL_LIBAV_API void pl_frame_from_avframe(struct pl_frame *out, + const AVFrame *frame) +{ + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format); + int planes = av_pix_fmt_count_planes(frame->format); + const AVFrameSideData *sd; + assert(desc); + + if (desc->flags & AV_PIX_FMT_FLAG_HWACCEL) { + const AVHWFramesContext *hwfc = (AVHWFramesContext *) frame->hw_frames_ctx->data; + desc = av_pix_fmt_desc_get(hwfc->sw_format); + planes = av_pix_fmt_count_planes(hwfc->sw_format); + } + + // This should never fail, and there's nothing really useful we can do in + // this failure case anyway, since this is a `void` function. + assert(planes <= 4); + + *out = (struct pl_frame) { + .num_planes = planes, + .crop = { + .x0 = frame->crop_left, + .y0 = frame->crop_top, + .x1 = frame->width - frame->crop_right, + .y1 = frame->height - frame->crop_bottom, + }, + .repr = { + .sys = pl_system_from_av(frame->colorspace), + .levels = pl_levels_from_av(frame->color_range), + .alpha = (desc->flags & AV_PIX_FMT_FLAG_ALPHA) + ? PL_ALPHA_INDEPENDENT + : PL_ALPHA_UNKNOWN, + + // For sake of simplicity, just use the first component's depth as + // the authoritative color depth for the whole image. Usually, this + // will be overwritten by more specific information when using e.g. + // `pl_map_avframe`, but for the sake of e.g. 
users wishing to map + // hwaccel frames manually, this is a good default. + .bits.color_depth = desc->comp[0].depth, + }, + }; + + pl_color_space_from_avframe(&out->color, frame); + + if (frame->colorspace == AVCOL_SPC_ICTCP && + frame->color_trc == AVCOL_TRC_ARIB_STD_B67) + { + // libav* makes no distinction between PQ and HLG ICtCp, so we need + // to manually fix it in the case that we have HLG ICtCp data. + out->repr.sys = PL_COLOR_SYSTEM_BT_2100_HLG; + + } else if (strncmp(desc->name, "xyz", 3) == 0) { + + // libav* handles this as a special case, but doesn't provide an + // explicit flag for it either, so we have to resort to this ugly + // hack... + out->repr.sys = PL_COLOR_SYSTEM_XYZ; + + } else if (desc->flags & AV_PIX_FMT_FLAG_RGB) { + + out->repr.sys = PL_COLOR_SYSTEM_RGB; + out->repr.levels = PL_COLOR_LEVELS_FULL; // libav* ignores levels for RGB + + } else if (!pl_color_system_is_ycbcr_like(out->repr.sys)) { + // libav* likes leaving this as UNKNOWN (or even RGB) for YCbCr frames, + // which confuses libplacebo since we infer UNKNOWN as RGB. To get + // around this, explicitly infer a suitable colorspace. + out->repr.sys = pl_color_system_guess_ycbcr(frame->width, frame->height); + } + + if ((sd = av_frame_get_side_data(frame, AV_FRAME_DATA_ICC_PROFILE))) { + out->profile = (struct pl_icc_profile) { + .data = sd->data, + .len = sd->size, + }; + + // Needed to ensure profile uniqueness + pl_icc_profile_compute_signature(&out->profile); + } + + if ((sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DISPLAYMATRIX))) { + double rot = av_display_rotation_get((const int32_t *) sd->data); + out->rotation = pl_rotation_normalize(4.5 - rot / 90.0); + } + +#ifdef PL_HAVE_LAV_FILM_GRAIN + if ((sd = av_frame_get_side_data(frame, AV_FRAME_DATA_FILM_GRAIN_PARAMS))) + pl_film_grain_from_av(&out->film_grain, (AVFilmGrainParams *) sd->data); +#endif // HAVE_LAV_FILM_GRAIN + + for (int p = 0; p < out->num_planes; p++) { + struct pl_plane *plane = &out->planes[p]; + + // Fill in the component mapping array + for (int c = 0; c < desc->nb_components; c++) { + if (desc->comp[c].plane == p) + plane->component_mapping[plane->components++] = c; + } + + // Clear the superfluous components + for (int c = plane->components; c < 4; c++) + plane->component_mapping[c] = PL_CHANNEL_NONE; + } + + // Only set the chroma location for definitely subsampled images, makes no + // sense otherwise + if (desc->log2_chroma_w || desc->log2_chroma_h) { + enum pl_chroma_location loc = pl_chroma_from_av(frame->chroma_location); + pl_frame_set_chroma_location(out, loc); + } +} + +#if LIBAVFORMAT_VERSION_INT >= AV_VERSION_INT(60, 15, 100) +PL_LIBAV_API const uint8_t *pl_av_stream_get_side_data(const AVStream *st, + enum AVPacketSideDataType type) +{ + const AVPacketSideData *sd; + sd = av_packet_side_data_get(st->codecpar->coded_side_data, + st->codecpar->nb_coded_side_data, + type); + return sd ? 
sd->data : NULL; +} +#else +# define pl_av_stream_get_side_data(st, type) av_stream_get_side_data(st, type, NULL) +#endif + +PL_LIBAV_API void pl_frame_copy_stream_props(struct pl_frame *out, + const AVStream *stream) +{ + const uint8_t *sd; + if ((sd = pl_av_stream_get_side_data(stream, AV_PKT_DATA_DISPLAYMATRIX))) { + double rot = av_display_rotation_get((const int32_t *) sd); + out->rotation = pl_rotation_normalize(4.5 - rot / 90.0); + } + +#ifdef PL_HAVE_LAV_HDR + pl_map_hdr_metadata(&out->color.hdr, &(struct pl_av_hdr_metadata) { + .mdm = (void *) pl_av_stream_get_side_data(stream, + AV_PKT_DATA_MASTERING_DISPLAY_METADATA), + .clm = (void *) pl_av_stream_get_side_data(stream, + AV_PKT_DATA_CONTENT_LIGHT_LEVEL), +# if LIBAVCODEC_VERSION_INT >= AV_VERSION_INT(59, 2, 100) + .dhp = (void *) pl_av_stream_get_side_data(stream, + AV_PKT_DATA_DYNAMIC_HDR10_PLUS), +# endif + }); +#endif +} + +#undef pl_av_stream_get_side_data + +#ifdef PL_HAVE_LAV_DOLBY_VISION +PL_LIBAV_API void pl_map_dovi_metadata(struct pl_dovi_metadata *out, + const AVDOVIMetadata *data) +{ + const AVDOVIRpuDataHeader *header; + const AVDOVIDataMapping *mapping; + const AVDOVIColorMetadata *color; + if (!data) + return; + + header = av_dovi_get_header(data); + mapping = av_dovi_get_mapping(data); + color = av_dovi_get_color(data); + + for (int i = 0; i < 3; i++) + out->nonlinear_offset[i] = av_q2d(color->ycc_to_rgb_offset[i]); + for (int i = 0; i < 9; i++) { + float *nonlinear = &out->nonlinear.m[0][0]; + float *linear = &out->linear.m[0][0]; + nonlinear[i] = av_q2d(color->ycc_to_rgb_matrix[i]); + linear[i] = av_q2d(color->rgb_to_lms_matrix[i]); + } + for (int c = 0; c < 3; c++) { + const AVDOVIReshapingCurve *csrc = &mapping->curves[c]; + struct pl_reshape_data *cdst = &out->comp[c]; + cdst->num_pivots = csrc->num_pivots; + for (int i = 0; i < csrc->num_pivots; i++) { + const float scale = 1.0f / ((1 << header->bl_bit_depth) - 1); + cdst->pivots[i] = scale * csrc->pivots[i]; + } + for (int i = 0; i < csrc->num_pivots - 1; i++) { + const float scale = 1.0f / (1 << header->coef_log2_denom); + cdst->method[i] = csrc->mapping_idc[i]; + switch (csrc->mapping_idc[i]) { + case AV_DOVI_MAPPING_POLYNOMIAL: + for (int k = 0; k < 3; k++) { + cdst->poly_coeffs[i][k] = (k <= csrc->poly_order[i]) + ? 
scale * csrc->poly_coef[i][k] + : 0.0f; + } + break; + case AV_DOVI_MAPPING_MMR: + cdst->mmr_order[i] = csrc->mmr_order[i]; + cdst->mmr_constant[i] = scale * csrc->mmr_constant[i]; + for (int j = 0; j < csrc->mmr_order[i]; j++) { + for (int k = 0; k < 7; k++) + cdst->mmr_coeffs[i][j][k] = scale * csrc->mmr_coef[i][j][k]; + } + break; + } + } + } +} + +PL_LIBAV_API void pl_frame_map_avdovi_metadata(struct pl_frame *out_frame, + struct pl_dovi_metadata *dovi, + const AVDOVIMetadata *metadata) +{ + const AVDOVIRpuDataHeader *header; + const AVDOVIColorMetadata *color; + if (!dovi || !metadata) + return; + + header = av_dovi_get_header(metadata); + color = av_dovi_get_color(metadata); + if (header->disable_residual_flag) { + pl_map_dovi_metadata(dovi, metadata); + + out_frame->repr.dovi = dovi; + out_frame->repr.sys = PL_COLOR_SYSTEM_DOLBYVISION; + out_frame->color.primaries = PL_COLOR_PRIM_BT_2020; + out_frame->color.transfer = PL_COLOR_TRC_PQ; + out_frame->color.hdr.min_luma = + pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NITS, color->source_min_pq / 4095.0f); + out_frame->color.hdr.max_luma = + pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NITS, color->source_max_pq / 4095.0f); + } +} +#endif // PL_HAVE_LAV_DOLBY_VISION + +PL_LIBAV_API bool pl_frame_recreate_from_avframe(pl_gpu gpu, + struct pl_frame *out, + pl_tex tex[4], + const AVFrame *frame) +{ + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format); + struct pl_plane_data data[4] = {0}; + int planes; + + pl_frame_from_avframe(out, frame); + planes = pl_plane_data_from_pixfmt(data, &out->repr.bits, frame->format); + if (!planes) + return false; + + for (int p = 0; p < planes; p++) { + bool is_chroma = p == 1 || p == 2; // matches lavu logic + data[p].width = AV_CEIL_RSHIFT(frame->width, is_chroma ? desc->log2_chroma_w : 0); + data[p].height = AV_CEIL_RSHIFT(frame->height, is_chroma ? 
desc->log2_chroma_h : 0); + + if (!pl_recreate_plane(gpu, &out->planes[p], &tex[p], &data[p])) + return false; + } + + return true; +} + +static void pl_avframe_free_cb(void *priv) +{ + AVFrame *frame = priv; + av_frame_free(&frame); +} + +#define PL_MAGIC0 0xfb5b3b8b +#define PL_MAGIC1 0xee659f6d + +struct pl_avalloc { + uint32_t magic[2]; + pl_gpu gpu; + pl_buf buf; +}; + +// Attached to `pl_frame.user_data` for mapped AVFrames +struct pl_avframe_priv { + AVFrame *avframe; + struct pl_dovi_metadata dovi; // backing storage for per-frame dovi metadata + pl_tex planar; // for planar vulkan textures +}; + +static void pl_fix_hwframe_sample_depth(struct pl_frame *out, const AVFrame *frame) +{ + const AVHWFramesContext *hwfc = (AVHWFramesContext *) frame->hw_frames_ctx->data; + pl_fmt fmt = out->planes[0].texture->params.format; + struct pl_bit_encoding *bits = &out->repr.bits; + + bits->sample_depth = fmt->component_depth[0]; + + switch (hwfc->sw_format) { + case AV_PIX_FMT_P010: bits->bit_shift = 6; break; + default: break; + } +} + +static bool pl_map_avframe_drm(pl_gpu gpu, struct pl_frame *out, + const AVFrame *frame) +{ + const AVHWFramesContext *hwfc = (AVHWFramesContext *) frame->hw_frames_ctx->data; + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(hwfc->sw_format); + const AVDRMFrameDescriptor *drm = (AVDRMFrameDescriptor *) frame->data[0]; + assert(frame->format == AV_PIX_FMT_DRM_PRIME); + if (!(gpu->import_caps.tex & PL_HANDLE_DMA_BUF)) + return false; + + assert(drm->nb_layers >= out->num_planes); + for (int n = 0; n < out->num_planes; n++) { + const AVDRMLayerDescriptor *layer = &drm->layers[n]; + const AVDRMPlaneDescriptor *plane = &layer->planes[0]; + const AVDRMObjectDescriptor *object = &drm->objects[plane->object_index]; + pl_fmt fmt = pl_find_fourcc(gpu, layer->format); + bool is_chroma = n == 1 || n == 2; + if (!fmt || !pl_fmt_has_modifier(fmt, object->format_modifier)) + return false; + + assert(layer->nb_planes == 1); // we only support planar formats + assert(plane->pitch >= 0); // definitely requires special handling + out->planes[n].texture = pl_tex_create(gpu, pl_tex_params( + .w = AV_CEIL_RSHIFT(frame->width, is_chroma ? desc->log2_chroma_w : 0), + .h = AV_CEIL_RSHIFT(frame->height, is_chroma ? 
desc->log2_chroma_h : 0), + .format = fmt, + .sampleable = true, + .blit_src = fmt->caps & PL_FMT_CAP_BLITTABLE, + .import_handle = PL_HANDLE_DMA_BUF, + .shared_mem = { + .handle.fd = object->fd, + .size = object->size, + .offset = plane->offset, + .drm_format_mod = object->format_modifier, + .stride_w = plane->pitch, + }, + )); + if (!out->planes[n].texture) + return false; + } + + pl_fix_hwframe_sample_depth(out, frame); + return true; +} + +// Derive a DMABUF from any other hwaccel format, and map that instead +static bool pl_map_avframe_derived(pl_gpu gpu, struct pl_frame *out, + const AVFrame *frame) +{ + const int flags = AV_HWFRAME_MAP_READ | AV_HWFRAME_MAP_DIRECT; + struct pl_avframe_priv *priv = out->user_data; + AVFrame *derived = av_frame_alloc(); + derived->width = frame->width; + derived->height = frame->height; + derived->format = AV_PIX_FMT_DRM_PRIME; + derived->hw_frames_ctx = av_buffer_ref(frame->hw_frames_ctx); + if (av_hwframe_map(derived, frame, flags) < 0) + goto error; + if (av_frame_copy_props(derived, frame) < 0) + goto error; + if (!pl_map_avframe_drm(gpu, out, derived)) + goto error; + + av_frame_free(&priv->avframe); + priv->avframe = derived; + return true; + +error: + av_frame_free(&derived); + return false; +} + +#ifdef PL_HAVE_LAV_VULKAN +static bool pl_acquire_avframe(pl_gpu gpu, struct pl_frame *frame) +{ + const struct pl_avframe_priv *priv = frame->user_data; + AVHWFramesContext *hwfc = (void *) priv->avframe->hw_frames_ctx->data; + AVVulkanFramesContext *vkfc = hwfc->hwctx; + AVVkFrame *vkf = (AVVkFrame *) priv->avframe->data[0]; + +#ifdef PL_HAVE_LAV_VULKAN_V2 + vkfc->lock_frame(hwfc, vkf); +#else + (void) vkfc; +#endif + + for (int n = 0; n < frame->num_planes; n++) { + pl_vulkan_release_ex(gpu, pl_vulkan_release_params( + .tex = priv->planar ? priv->planar : frame->planes[n].texture, + .layout = vkf->layout[n], + .qf = VK_QUEUE_FAMILY_IGNORED, + .semaphore = { + .sem = vkf->sem[n], + .value = vkf->sem_value[n], + }, + )); + if (priv->planar) + break; + } + + return true; +} + +static void pl_release_avframe(pl_gpu gpu, struct pl_frame *frame) +{ + const struct pl_avframe_priv *priv = frame->user_data; + AVHWFramesContext *hwfc = (void *) priv->avframe->hw_frames_ctx->data; + AVVulkanFramesContext *vkfc = hwfc->hwctx; + AVVkFrame *vkf = (AVVkFrame *) priv->avframe->data[0]; + + for (int n = 0; n < frame->num_planes; n++) { + int ok = pl_vulkan_hold_ex(gpu, pl_vulkan_hold_params( + .tex = priv->planar ? 
priv->planar : frame->planes[n].texture, + .out_layout = &vkf->layout[n], + .qf = VK_QUEUE_FAMILY_IGNORED, + .semaphore = { + .sem = vkf->sem[n], + .value = vkf->sem_value[n] + 1, + }, + )); + + vkf->access[n] = 0; + vkf->sem_value[n] += !!ok; + if (priv->planar) + break; + } + +#ifdef PL_HAVE_LAV_VULKAN_V2 + vkfc->unlock_frame(hwfc, vkf); +#else + (void) vkfc; +#endif +} + +static bool pl_map_avframe_vulkan(pl_gpu gpu, struct pl_frame *out, + const AVFrame *frame) +{ + const AVHWFramesContext *hwfc = (AVHWFramesContext *) frame->hw_frames_ctx->data; + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(hwfc->sw_format); + const AVVulkanFramesContext *vkfc = hwfc->hwctx; + AVVkFrame *vkf = (AVVkFrame *) frame->data[0]; + struct pl_avframe_priv *priv = out->user_data; + pl_vulkan vk = pl_vulkan_get(gpu); + +#ifdef PL_HAVE_LAV_VULKAN_V2 + const VkFormat *vk_fmt = vkfc->format; +#else + const VkFormat *vk_fmt = av_vkfmt_from_pixfmt(hwfc->sw_format); +#endif + + assert(frame->format == AV_PIX_FMT_VULKAN); + priv->planar = NULL; + if (!vk) + return false; + + for (int n = 0; n < out->num_planes; n++) { + struct pl_plane *plane = &out->planes[n]; + bool chroma = n == 1 || n == 2; + int num_subplanes; + assert(vk_fmt[n]); + + plane->texture = pl_vulkan_wrap(gpu, pl_vulkan_wrap_params( + .image = vkf->img[n], + .width = AV_CEIL_RSHIFT(hwfc->width, chroma ? desc->log2_chroma_w : 0), + .height = AV_CEIL_RSHIFT(hwfc->height, chroma ? desc->log2_chroma_h : 0), + .format = vk_fmt[n], + .usage = vkfc->usage, + )); + if (!plane->texture) + return false; + + num_subplanes = plane->texture->params.format->num_planes; + if (num_subplanes) { + assert(num_subplanes == out->num_planes); + priv->planar = plane->texture; + for (int i = 0; i < num_subplanes; i++) + out->planes[i].texture = priv->planar->planes[i]; + break; + } + } + + out->acquire = pl_acquire_avframe; + out->release = pl_release_avframe; + pl_fix_hwframe_sample_depth(out, frame); + return true; +} + +static void pl_unmap_avframe_vulkan(pl_gpu gpu, struct pl_frame *frame) +{ + struct pl_avframe_priv *priv = frame->user_data; + if (priv->planar) { + pl_tex_destroy(gpu, &priv->planar); + for (int n = 0; n < frame->num_planes; n++) + frame->planes[n].texture = NULL; + } +} +#endif + +PL_LIBAV_API bool pl_map_avframe_ex(pl_gpu gpu, struct pl_frame *out, + const struct pl_avframe_params *params) +{ + const AVFrame *frame = params->frame; + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format); + struct pl_plane_data data[4] = {0}; + pl_tex *tex = params->tex; + int planes; + + struct pl_avframe_priv *priv = malloc(sizeof(*priv)); + if (!priv) + goto error; + + pl_frame_from_avframe(out, frame); + priv->avframe = av_frame_clone(frame); + out->user_data = priv; + +#ifdef PL_HAVE_LAV_DOLBY_VISION + if (params->map_dovi) { + AVFrameSideData *sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DOVI_METADATA); + if (sd) { + const AVDOVIMetadata *metadata = (AVDOVIMetadata *) sd->data; + const AVDOVIRpuDataHeader *header = av_dovi_get_header(metadata); + // Only automatically map DoVi RPUs that don't require an EL + if (header->disable_residual_flag) + pl_frame_map_avdovi_metadata(out, &priv->dovi, metadata); + } + +#ifdef PL_HAVE_LIBDOVI + sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DOVI_RPU_BUFFER); + if (sd) + pl_hdr_metadata_from_dovi_rpu(&out->color.hdr, sd->buf->data, sd->buf->size); +#endif // PL_HAVE_LIBDOVI + } + +#endif // PL_HAVE_LAV_DOLBY_VISION + + switch (frame->format) { + case AV_PIX_FMT_DRM_PRIME: + if 
(!pl_map_avframe_drm(gpu, out, frame)) + goto error; + return true; + + case AV_PIX_FMT_VAAPI: + if (!pl_map_avframe_derived(gpu, out, frame)) + goto error; + return true; + +#ifdef PL_HAVE_LAV_VULKAN + case AV_PIX_FMT_VULKAN: + if (!pl_map_avframe_vulkan(gpu, out, frame)) + goto error; + return true; +#endif + + default: break; + } + + // Backing textures are required from this point onwards + if (!tex) + goto error; + + planes = pl_plane_data_from_pixfmt(data, &out->repr.bits, frame->format); + if (!planes) + goto error; + + for (int p = 0; p < planes; p++) { + AVBufferRef *buf = av_frame_get_plane_buffer((AVFrame *) frame, p); + struct pl_avalloc *alloc = buf ? av_buffer_get_opaque(buf) : NULL; + bool is_chroma = p == 1 || p == 2; // matches lavu logic + + data[p].width = AV_CEIL_RSHIFT(frame->width, is_chroma ? desc->log2_chroma_w : 0); + data[p].height = AV_CEIL_RSHIFT(frame->height, is_chroma ? desc->log2_chroma_h : 0); + if (frame->linesize[p] < 0) { + data[p].pixels = frame->data[p] + frame->linesize[p] * (data[p].height - 1); + data[p].row_stride = -frame->linesize[p]; + out->planes[p].flipped = true; + } else { + data[p].pixels = frame->data[p]; + data[p].row_stride = frame->linesize[p]; + } + + // Probe for frames allocated by pl_get_buffer2 + if (alloc && alloc->magic[0] == PL_MAGIC0 && alloc->magic[1] == PL_MAGIC1) { + data[p].buf = alloc->buf; + data[p].buf_offset = (uintptr_t) data[p].pixels - (uintptr_t) alloc->buf->data; + data[p].pixels = NULL; + } else if (gpu->limits.callbacks) { + // Use asynchronous upload if possible + data[p].callback = pl_avframe_free_cb; + data[p].priv = av_frame_clone(frame); + } + + if (!pl_upload_plane(gpu, &out->planes[p], &tex[p], &data[p])) { + av_frame_free((AVFrame **) &data[p].priv); + goto error; + } + + out->planes[p].texture = tex[p]; + } + + return true; + +error: + pl_unmap_avframe(gpu, out); + return false; +} + +// Backwards compatibility with previous versions of this API. 
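As a rough illustration of how the mapping helpers above are meant to be driven, here is a minimal sketch, not taken from the patch itself; the rendering step is elided, and the compatibility wrapper declared just below simply forwards to `pl_map_avframe_ex`:

#include <libavutil/frame.h>
#include <libplacebo/utils/libav.h>

// Map a decoded AVFrame, use the resulting pl_frame, then unmap it again.
// `tex[4]` should persist across calls so the backing textures can be
// reused for software (non-hwaccel) frames.
static bool map_use_unmap(pl_gpu gpu, const AVFrame *avframe, pl_tex tex[4])
{
    struct pl_frame frame;
    if (!pl_map_avframe_ex(gpu, &frame, &(struct pl_avframe_params) {
            .frame = avframe,
            .tex   = tex,   // backing textures, reused between frames
        }))
        return false;

    // ... use `frame` here, e.g. as the input image for rendering ...

    pl_unmap_avframe(gpu, &frame);
    return true;
}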
+PL_LIBAV_API bool pl_map_avframe(pl_gpu gpu, struct pl_frame *out_frame, + pl_tex tex[4], const AVFrame *avframe) +{ + return pl_map_avframe_ex(gpu, out_frame, &(struct pl_avframe_params) { + .frame = avframe, + .tex = tex, + }); +} + +PL_LIBAV_API void pl_unmap_avframe(pl_gpu gpu, struct pl_frame *frame) +{ + struct pl_avframe_priv *priv = frame->user_data; + const AVPixFmtDescriptor *desc; + if (!priv) + goto done; + +#ifdef PL_HAVE_LAV_VULKAN + if (priv->avframe->format == AV_PIX_FMT_VULKAN) + pl_unmap_avframe_vulkan(gpu, frame); +#endif + + desc = av_pix_fmt_desc_get(priv->avframe->format); + if (desc->flags & AV_PIX_FMT_FLAG_HWACCEL) { + for (int i = 0; i < 4; i++) + pl_tex_destroy(gpu, &frame->planes[i].texture); + } + + av_frame_free(&priv->avframe); + free(priv); + +done: + memset(frame, 0, sizeof(*frame)); // sanity +} + +PL_LIBAV_API AVFrame *pl_get_mapped_avframe(const struct pl_frame *frame) +{ + struct pl_avframe_priv *priv = frame->user_data; + return priv->avframe; +} + +static void pl_done_cb(void *priv) +{ + bool *status = priv; + *status = true; +} + +PL_LIBAV_API bool pl_download_avframe(pl_gpu gpu, + const struct pl_frame *frame, + AVFrame *out_frame) +{ + bool done[4] = {0}; + if (frame->num_planes != av_pix_fmt_count_planes(out_frame->format)) + return false; + + for (int p = 0; p < frame->num_planes; p++) { + bool ok = pl_tex_download(gpu, pl_tex_transfer_params( + .tex = frame->planes[p].texture, + .row_pitch = out_frame->linesize[p], + .ptr = out_frame->data[p], + // Use synchronous transfer for the last plane + .callback = (p+1) < frame->num_planes ? pl_done_cb : NULL, + .priv = &done[p], + )); + + if (!ok) + return false; + } + + for (int p = 0; p < frame->num_planes - 1; p++) { + while (!done[p]) + pl_tex_poll(gpu, frame->planes[p].texture, UINT64_MAX); + } + + return true; +} + +#define PL_DIV_UP(x, y) (((x) + (y) - 1) / (y)) +#define PL_ALIGN(x, align) ((align) ? PL_DIV_UP(x, align) * (align) : (x)) +#define PL_MAX(x, y) ((x) > (y) ? (x) : (y)) +#define PL_LCM(x, y) ((x) * ((y) / av_gcd(x, y))) + +static inline void pl_avalloc_free(void *opaque, uint8_t *data) +{ + struct pl_avalloc *alloc = opaque; + assert(alloc->magic[0] == PL_MAGIC0); + assert(alloc->magic[1] == PL_MAGIC1); + assert(alloc->buf->data == data); + pl_buf_destroy(alloc->gpu, &alloc->buf); + free(alloc); +} + +PL_LIBAV_API int pl_get_buffer2(AVCodecContext *avctx, AVFrame *pic, int flags) +{ + int alignment[AV_NUM_DATA_POINTERS]; + int width = pic->width; + int height = pic->height; + size_t planesize[4]; + int ret = 0; + + pl_gpu *pgpu = avctx->opaque; + pl_gpu gpu = pgpu ? 
*pgpu : NULL; + struct pl_plane_data data[4]; + struct pl_avalloc *alloc; + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pic->format); + int planes = pl_plane_data_from_pixfmt(data, NULL, pic->format); + + // Sanitize frame structs + memset(pic->data, 0, sizeof(pic->data)); + memset(pic->linesize, 0, sizeof(pic->linesize)); + memset(pic->buf, 0, sizeof(pic->buf)); + pic->extended_data = pic->data; + pic->extended_buf = NULL; + + if (!(avctx->codec->capabilities & AV_CODEC_CAP_DR1) || !planes) + goto fallback; + if (!gpu || !gpu->limits.thread_safe || !gpu->limits.max_mapped_size || + !gpu->limits.host_cached) + { + goto fallback; + } + + avcodec_align_dimensions2(avctx, &width, &height, alignment); + if ((ret = av_image_fill_linesizes(pic->linesize, pic->format, width))) + return ret; + + for (int p = 0; p < planes; p++) { + alignment[p] = PL_LCM(alignment[p], gpu->limits.align_tex_xfer_pitch); + alignment[p] = PL_LCM(alignment[p], gpu->limits.align_tex_xfer_offset); + alignment[p] = PL_LCM(alignment[p], data[p].pixel_stride); + pic->linesize[p] = PL_ALIGN(pic->linesize[p], alignment[p]); + } + +#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(56, 56, 100) + ret = av_image_fill_plane_sizes(planesize, pic->format, height, (ptrdiff_t[4]) { + pic->linesize[0], pic->linesize[1], pic->linesize[2], pic->linesize[3], + }); + if (ret < 0) + return ret; +#else + uint8_t *ptrs[4], * const base = (uint8_t *) 0x10000; + ret = av_image_fill_pointers(ptrs, pic->format, height, base, pic->linesize); + if (ret < 0) + return ret; + for (int p = 0; p < 4; p++) + planesize[p] = (uintptr_t) ptrs[p] - (uintptr_t) base; +#endif + + for (int p = 0; p < planes; p++) { + const size_t buf_size = planesize[p] + alignment[p]; + if (buf_size > gpu->limits.max_mapped_size) { + av_frame_unref(pic); + goto fallback; + } + + alloc = malloc(sizeof(*alloc)); + if (!alloc) { + av_frame_unref(pic); + return AVERROR(ENOMEM); + } + + *alloc = (struct pl_avalloc) { + .magic = { PL_MAGIC0, PL_MAGIC1 }, + .gpu = gpu, + .buf = pl_buf_create(gpu, pl_buf_params( + .size = buf_size, + .memory_type = PL_BUF_MEM_HOST, + .host_mapped = true, + .storable = desc->flags & AV_PIX_FMT_FLAG_BE, + )), + }; + + if (!alloc->buf) { + free(alloc); + av_frame_unref(pic); + return AVERROR(ENOMEM); + } + + pic->data[p] = (uint8_t *) PL_ALIGN((uintptr_t) alloc->buf->data, alignment[p]); + pic->buf[p] = av_buffer_create(alloc->buf->data, buf_size, pl_avalloc_free, alloc, 0); + if (!pic->buf[p]) { + pl_buf_destroy(gpu, &alloc->buf); + free(alloc); + av_frame_unref(pic); + return AVERROR(ENOMEM); + } + } + + return 0; + +fallback: + return avcodec_default_get_buffer2(avctx, pic, flags); +} + +#undef PL_MAGIC0 +#undef PL_MAGIC1 +#undef PL_ALIGN +#undef PL_MAX + +#endif // LIBPLACEBO_LIBAV_H_ diff --git a/src/include/libplacebo/utils/upload.h b/src/include/libplacebo/utils/upload.h new file mode 100644 index 0000000..9e8d436 --- /dev/null +++ b/src/include/libplacebo/utils/upload.h @@ -0,0 +1,153 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_UPLOAD_H_ +#define LIBPLACEBO_UPLOAD_H_ + +#include <stdint.h> + +#include <libplacebo/gpu.h> +#include <libplacebo/renderer.h> + +PL_API_BEGIN + +// This file contains a utility function to assist in uploading data from host +// memory to a texture. In particular, the texture will be suitable for use as +// a `pl_plane`. + +// Description of the host representation of an image plane +struct pl_plane_data { + enum pl_fmt_type type; // meaning of the data (must not be UINT or SINT) + int width, height; // dimensions of the plane + int component_size[4]; // size in bits of each coordinate + int component_pad[4]; // ignored bits preceding each component + int component_map[4]; // semantic meaning of each component (pixel order) + size_t pixel_stride; // offset in bytes between pixels (required) + size_t row_stride; // offset in bytes between rows (optional) + bool swapped; // pixel data is endian-swapped (non-native) + + // Similar to `pl_tex_transfer_params`, you can either upload from a raw + // pointer address, or a buffer + offset. Again, the use of these two + // mechanisms is mutually exclusive. + // + // 1. Uploading from host memory + const void *pixels; // the actual data underlying this plane + + // 2. Uploading from a buffer (requires `pl_gpu_limits.buf_transfer`) + pl_buf buf; // the buffer to use + size_t buf_offset; // offset of data within buffer, must be a + // multiple of `pixel_stride` as well as of 4 + + // Similar to `pl_tex_transfer_params.callback`, this allows turning the + // upload of a plane into an asynchronous upload. The same notes apply. + void (*callback)(void *priv); + void *priv; + + // Note: When using this together with `pl_frame`, there is some amount of + // overlap between `component_pad` and `pl_color_repr.bits`. Some key + // differences between the two: + // + // - the bits from `component_pad` are ignored; whereas the superfluous bits + // in a `pl_color_repr` must be 0. + // - the `component_pad` exists to align the component size and placement + // with the capabilities of GPUs; the `pl_color_repr` exists to control + // the semantics of the color samples on a finer granularity. + // - the `pl_color_repr` applies to the color sample as a whole, and + // therefore applies to all planes; the `component_pad` can be different + // for each plane. + // - `component_pad` interacts with float textures by moving the actual + // float in memory. `pl_color_repr` interacts with float data as if + // the float was converted from an integer under full range semantics. + // + // To help establish the motivating difference, a typical example of a use + // case would be yuv420p10. Since 10-bit GPU texture support is limited, + // and working with non-byte-aligned pixels is awkward in general, the + // convention is to represent yuv420p10 as 16-bit samples with either the + // high or low bits set to 0. In this scenario, the `component_size` of the + // `pl_plane_data` and `pl_bit_encoding.sample_depth` would be 16, while + // the `pl_bit_encoding.color_depth` would be 10 (and additionally, the + // `pl_bit_encoding.bit_shift` would be either 0 or 6, depending on + // whether the low or the high bits are used). 
+ // + // On the contrary, something like a packed, 8-bit XBGR format (where the + // X bits are ignored and may contain garbage) would set `component_pad[0]` + // to 8, and the component_size[0:2] (respectively) to 8 as well. + // + // As a general rule of thumb, for maximum compatibility, you should try + // and align component_size/component_pad to multiples of 8 and explicitly + // clear any remaining superfluous bits (+ use `pl_color_repr.bits` to + // ensure they're decoded correctly). You should also try to align the + // `pixel_stride` to a power of two. +}; + +// Fills in the `component_size`, `component_pad` and `component_map` fields +// based on the supplied mask for each component (in semantic order, i.e. +// RGBA). Each element of `mask` must have a contiguous range of set bits. +PL_API void pl_plane_data_from_mask(struct pl_plane_data *data, uint64_t mask[4]); + +// Fills in the `component_size`, `component_pad` and `component_map` fields +// based on the supplied sizes (in bits) and shift of each component (in +// semantic order). +// +// Similar to `pl_plane_data_from_mask` but not limited to 64-bit pixels. +PL_API void pl_plane_data_from_comps(struct pl_plane_data *data, int size[4], + int shift[4]); + +// Helper function to take a `pl_plane_data` struct and try and improve its +// alignment to make it more likely to correspond to a real `pl_fmt`. It does +// this by attempting to round each component up to the nearest byte boundary. +// This relies on the assumption (true in practice) that superfluous bits of +// byte-misaligned formats are explicitly set to 0. +// +// The resulting shift must be consistent across all components, in which case +// it's returned in `out_bits`. If no alignment was possible, `out_bits` is set +// to {0}, and this function returns false. +PL_API bool pl_plane_data_align(struct pl_plane_data *data, struct pl_bit_encoding *out_bits); + +// Helper function to find a suitable `pl_fmt` based on a pl_plane_data's +// requirements. This is called internally by `pl_upload_plane`, but it's +// exposed to users both as a convenience and so they may pre-emptively check +// if a format would be supported without actually having to attempt the upload. +PL_API pl_fmt pl_plane_find_fmt(pl_gpu gpu, int out_map[4], const struct pl_plane_data *data); + +// Upload an image plane to a texture, and output the resulting `pl_plane` +// struct to `out_plane` (optional). `tex` must be a valid pointer to a texture +// (or NULL), which will be destroyed and reinitialized if it does not already +// exist or is incompatible. Returns whether successful. +// +// The resulting texture is guaranteed to be `sampleable`, and it will also try +// and maximize compatibility with the other `pl_renderer` requirements +// (blittable, linear filterable, etc.). +// +// Note: `out_plane->shift_x/y` and `out_plane->flipped` are left +// uninitialized, and should be set explicitly by the user. +PL_API bool pl_upload_plane(pl_gpu gpu, struct pl_plane *out_plane, + pl_tex *tex, const struct pl_plane_data *data); + +// Like `pl_upload_plane`, but only creates an uninitialized texture object +// rather than actually performing an upload. This can be useful to, for +// example, prepare textures to be used as the target of rendering. +// +// The resulting texture is guaranteed to be `renderable`, and it will also try +// to maximize compatibility with the other `pl_renderer` requirements +// (blittable, storable, etc.). 
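To make the upload path described by this header concrete, here is a minimal sketch, not taken from the patch, of uploading a single packed 8-bit RGBA plane via `pl_plane_data_from_mask` and `pl_upload_plane`; the pixel buffer and dimensions are placeholders supplied by the caller:

#include <libplacebo/utils/upload.h>

static bool upload_rgba8(pl_gpu gpu, const void *pixels, int w, int h,
                         struct pl_plane *out_plane, pl_tex *tex)
{
    struct pl_plane_data data = {
        .type         = PL_FMT_UNORM,
        .width        = w,
        .height       = h,
        .pixel_stride = 4,       // 4 bytes per pixel: R, G, B, A
        .pixels       = pixels,  // tightly packed, row_stride left at 0
    };

    // Derive component_size/component_map from per-channel bit masks (RGBA order)
    pl_plane_data_from_mask(&data, (uint64_t[4]) {
        0x000000ffull,  // R in the lowest byte
        0x0000ff00ull,  // G
        0x00ff0000ull,  // B
        0xff000000ull,  // A
    });

    return pl_upload_plane(gpu, out_plane, tex, &data);
}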
+PL_API bool pl_recreate_plane(pl_gpu gpu, struct pl_plane *out_plane, + pl_tex *tex, const struct pl_plane_data *data); + +PL_API_END + +#endif // LIBPLACEBO_UPLOAD_H_ diff --git a/src/include/libplacebo/vulkan.h b/src/include/libplacebo/vulkan.h new file mode 100644 index 0000000..4e5db95 --- /dev/null +++ b/src/include/libplacebo/vulkan.h @@ -0,0 +1,638 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_VULKAN_H_ +#define LIBPLACEBO_VULKAN_H_ + +#include <vulkan/vulkan.h> +#include <libplacebo/gpu.h> +#include <libplacebo/swapchain.h> + +PL_API_BEGIN + +#define PL_VK_MIN_VERSION VK_API_VERSION_1_2 + +// Structure representing a VkInstance. Using this is not required. +typedef const struct pl_vk_inst_t { + VkInstance instance; + + // The Vulkan API version supported by this VkInstance. + uint32_t api_version; + + // The associated vkGetInstanceProcAddr pointer. + PFN_vkGetInstanceProcAddr get_proc_addr; + + // The instance extensions that were successfully enabled, including + // extensions enabled by libplacebo internally. May contain duplicates. + const char * const *extensions; + int num_extensions; + + // The instance layers that were successfully enabled, including + // layers enabled by libplacebo internally. May contain duplicates. + const char * const *layers; + int num_layers; +} *pl_vk_inst; + +struct pl_vk_inst_params { + // If set, enable the debugging and validation layers. These should + // generally be lightweight and relatively harmless to enable. + bool debug; + + // If set, also enable GPU-assisted verification and best practices + // layers. (Note: May cause substantial slowdown and/or result in lots of + // false positive spam) + bool debug_extra; + + // If nonzero, restricts the Vulkan API version to be at most this. This + // is only really useful for explicitly testing backwards compatibility. + uint32_t max_api_version; + + // Pointer to a user-provided `vkGetInstanceProcAddr`. If this is NULL, + // libplacebo will use the directly linked version (if available). + PFN_vkGetInstanceProcAddr get_proc_addr; + + // Enables extra instance extensions. Instance creation will fail if these + // extensions are not all supported. The user may use this to enable e.g. + // windowing system integration. + const char * const *extensions; + int num_extensions; + + // Enables extra optional instance extensions. These are opportunistically + // enabled if supported by the device, but otherwise skipped. + const char * const *opt_extensions; + int num_opt_extensions; + + // Enables extra layers. Instance creation will fail if these layers are + // not all supported. + // + // NOTE: Layers needed for required/optional extensions are automatically + // enabled. The user does not specifically need to enable layers related + // to extension support. 
+ const char * const *layers; + int num_layers; + + // Enables extra optional layers. These are opportunistically enabled if + // supported by the platform, but otherwise skipped. + const char * const *opt_layers; + int num_opt_layers; +}; + +#define pl_vk_inst_params(...) (&(struct pl_vk_inst_params) { __VA_ARGS__ }) +PL_API extern const struct pl_vk_inst_params pl_vk_inst_default_params; + +// Helper function to simplify instance creation. The user could also bypass +// these helpers and do it manually, but this function is provided as a +// convenience. It also sets up a debug callback which forwards all vulkan +// messages to the `pl_log` callback. +PL_API pl_vk_inst pl_vk_inst_create(pl_log log, const struct pl_vk_inst_params *params); +PL_API void pl_vk_inst_destroy(pl_vk_inst *inst); + +struct pl_vulkan_queue { + uint32_t index; // Queue family index + uint32_t count; // Queue family count +}; + +// Structure representing the actual vulkan device and associated GPU instance +typedef const struct pl_vulkan_t *pl_vulkan; +struct pl_vulkan_t { + pl_gpu gpu; + + // The vulkan objects in use. The user may use this for their own purposes, + // but please note that the lifetime is tied to the lifetime of the + // pl_vulkan object, and must not be destroyed by the user. Note that the + // created vulkan device may have any number of queues and queue family + // assignments; so using it for queue submission commands is ill-advised. + VkInstance instance; + VkPhysicalDevice phys_device; + VkDevice device; + + // The associated vkGetInstanceProcAddr pointer. + PFN_vkGetInstanceProcAddr get_proc_addr; + + // The Vulkan API version supported by this VkPhysicalDevice. + uint32_t api_version; + + // The device extensions that were successfully enabled, including + // extensions enabled by libplacebo internally. May contain duplicates. + const char * const *extensions; + int num_extensions; + + // The device features that were enabled at device creation time. + // + // Note: Whenever a feature flag is ambiguious between several alternative + // locations, for completeness' sake, we include both. + const VkPhysicalDeviceFeatures2 *features; + + // The explicit queue families we are using to provide a given capability. + struct pl_vulkan_queue queue_graphics; // provides VK_QUEUE_GRAPHICS_BIT + struct pl_vulkan_queue queue_compute; // provides VK_QUEUE_COMPUTE_BIT + struct pl_vulkan_queue queue_transfer; // provides VK_QUEUE_TRANSFER_BIT + + // Functions for locking a queue. These must be used to lock VkQueues for + // submission or other related operations when sharing the VkDevice between + // multiple threads, Using this on queue families or indices not contained + // in `queues` is undefined behavior. + void (*lock_queue)(pl_vulkan vk, uint32_t qf, uint32_t qidx); + void (*unlock_queue)(pl_vulkan vk, uint32_t qf, uint32_t qidx); + + // --- Deprecated fields + + // These are the same active queue families and their queue counts in list + // form. This list does not contain duplicates, nor any extra queues + // enabled at device creation time. Deprecated in favor of querying + // `vkGetPhysicalDeviceQueueFamilyProperties` directly. + const struct pl_vulkan_queue *queues PL_DEPRECATED; + int num_queues PL_DEPRECATED; +}; + +struct pl_vulkan_params { + // The vulkan instance. Optional, if NULL then libplacebo will internally + // create a VkInstance with the settings from `instance_params`. 
+ // + // Note: The VkInstance provided by the user *MUST* be created with a + // VkApplicationInfo.apiVersion of PL_VK_MIN_VERSION or higher. + VkInstance instance; + + // Pointer to `vkGetInstanceProcAddr`. If this is NULL, libplacebo will + // use the directly linked version (if available). + // + // Note: This overwrites the same value from `instance_params`. + PFN_vkGetInstanceProcAddr get_proc_addr; + + // Configures the settings used for creating an internal vulkan instance. + // May be NULL. Ignored if `instance` is set. + const struct pl_vk_inst_params *instance_params; + + // When choosing the device, rule out all devices that don't support + // presenting to this surface. When creating a device, enable all extensions + // needed to ensure we can present to this surface. Optional. Only legal + // when specifying an existing VkInstance to use. + VkSurfaceKHR surface; + + // --- Physical device selection options + + // The vulkan physical device. May be set by the caller to indicate the + // physical device to use. Otherwise, libplacebo will pick the "best" + // available GPU, based on the advertised device type. (i.e., it will + // prefer discrete GPUs over integrated GPUs). Only legal when specifying + // an existing VkInstance to use. + VkPhysicalDevice device; + + // When choosing the device, only choose a device with this exact name. + // This overrides `allow_software`. No effect if `device` is set. Note: A + // list of devices and their names are logged at level PL_LOG_INFO. + const char *device_name; + + // When choosing the device, only choose a device with this exact UUID. + // This overrides `allow_software` and `device_name`. No effect if `device` + // is set. + uint8_t device_uuid[16]; + + // When choosing the device, controls whether or not to also allow software + // GPUs. No effect if `device` or `device_name` are set. + bool allow_software; + + // --- Logical device creation options + + // Controls whether or not to allow asynchronous transfers, using transfer + // queue families, if supported by the device. This can be significantly + // faster and more power efficient, and also allows streaming uploads in + // parallel with rendering commands. Enabled by default. + bool async_transfer; + + // Controls whether or not to allow asynchronous compute, using dedicated + // compute queue families, if supported by the device. On some devices, + // these can allow the GPU to schedule compute shaders in parallel with + // fragment shaders. Enabled by default. + bool async_compute; + + // Limits the number of queues to use. If left as 0, libplacebo will use as + // many queues as the device supports. Multiple queues can result in + // improved efficiency when submitting multiple commands that can entirely + // or partially execute in parallel. Defaults to 1, since using more queues + // can actually decrease performance. + // + // Note: libplacebo will always *create* logical devices with all available + // queues for a given QF enabled, regardless of this setting. + int queue_count; + + // Bitmask of extra queue families to enable. If set, then *all* queue + // families matching *any* of these flags will be enabled at device + // creation time. Setting this to VK_QUEUE_FLAG_BITS_MAX_ENUM effectively + // enables all queue families supported by the device. + VkQueueFlags extra_queues; + + // Enables extra device extensions. Device creation will fail if these + // extensions are not all supported. The user may use this to enable e.g. + // interop extensions. 
+ const char * const *extensions; + int num_extensions; + + // Enables extra optional device extensions. These are opportunistically + // enabled if supported by the device, but otherwise skipped. + const char * const *opt_extensions; + int num_opt_extensions; + + // Optional extra features to enable at device creation time. These are + // opportunistically enabled if supported by the physical device, but + // otherwise kept disabled. + const VkPhysicalDeviceFeatures2 *features; + + // --- Misc/debugging options + + // Restrict specific features to e.g. work around driver bugs, or simply + // for testing purposes + int max_glsl_version; // limit the maximum GLSL version + uint32_t max_api_version; // limit the maximum vulkan API version +}; + +// Default/recommended parameters. Should generally be safe and efficient. +#define PL_VULKAN_DEFAULTS \ + .async_transfer = true, \ + .async_compute = true, \ + /* enabling multiple queues often decreases perf */ \ + .queue_count = 1, + +#define pl_vulkan_params(...) (&(struct pl_vulkan_params) { PL_VULKAN_DEFAULTS __VA_ARGS__ }) +PL_API extern const struct pl_vulkan_params pl_vulkan_default_params; + +// Creates a new vulkan device based on the given parameters and initializes +// a new GPU. If `params` is left as NULL, it defaults to +// &pl_vulkan_default_params. +// +// Thread-safety: Safe +PL_API pl_vulkan pl_vulkan_create(pl_log log, const struct pl_vulkan_params *params); + +// Destroys the vulkan device and all associated objects, except for the +// VkInstance provided by the user. +// +// Note that all resources allocated from this vulkan object (e.g. via the +// `vk->ra` or using `pl_vulkan_create_swapchain`) *must* be explicitly +// destroyed by the user before calling this. +// +// Also note that this function will block until all in-flight GPU commands are +// finished processing. You can avoid this by manually calling `pl_gpu_finish` +// before `pl_vulkan_destroy`. +PL_API void pl_vulkan_destroy(pl_vulkan *vk); + +// For a `pl_gpu` backed by `pl_vulkan`, this function can be used to retrieve +// the underlying `pl_vulkan`. Returns NULL for any other type of `gpu`. +PL_API pl_vulkan pl_vulkan_get(pl_gpu gpu); + +struct pl_vulkan_device_params { + // The instance to use. Required! + // + // Note: The VkInstance provided by the user *must* be created with a + // VkApplicationInfo.apiVersion of PL_VK_MIN_VERSION or higher. + VkInstance instance; + + // Mirrored from `pl_vulkan_params`. All of these fields are optional. + PFN_vkGetInstanceProcAddr get_proc_addr; + VkSurfaceKHR surface; + const char *device_name; + uint8_t device_uuid[16]; + bool allow_software; +}; + +#define pl_vulkan_device_params(...) (&(struct pl_vulkan_device_params) { __VA_ARGS__ }) + +// Helper function to choose the best VkPhysicalDevice, given a VkInstance. +// This uses the same logic as `pl_vulkan_create` uses internally. If no +// matching device was found, this returns VK_NULL_HANDLE. +PL_API VkPhysicalDevice pl_vulkan_choose_device(pl_log log, + const struct pl_vulkan_device_params *params); + +struct pl_vulkan_swapchain_params { + // The surface to use for rendering. Required, the user is in charge of + // creating this. Must belong to the same VkInstance as `vk->instance`. + VkSurfaceKHR surface; + + // The preferred presentation mode. See the vulkan documentation for more + // information about these. If the device/surface combination does not + // support this mode, libplacebo will fall back to VK_PRESENT_MODE_FIFO_KHR. 
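Tying the creation entry points above together, a minimal device + swapchain initialization sketch might look as follows. This is an editor's illustration, not part of the header; it assumes `inst` was created beforehand with `pl_vk_inst_create` (listing any window-system extensions in `pl_vk_inst_params.extensions`) and that `surface` is a VkSurfaceKHR created by the window system against `inst->instance`:

#include <libplacebo/vulkan.h>

// Error handling is elided; each call returns NULL on failure.
static pl_swapchain init_placebo_vulkan(pl_log log, pl_vk_inst inst,
                                        VkSurfaceKHR surface, pl_vulkan *out_vk)
{
    pl_vulkan vk = pl_vulkan_create(log, pl_vulkan_params(
        .instance      = inst->instance,
        .get_proc_addr = inst->get_proc_addr,
        .surface       = surface,   // rule out devices that cannot present here
    ));
    if (!vk)
        return NULL;

    *out_vk = vk;
    return pl_vulkan_create_swapchain(vk, pl_vulkan_swapchain_params(
        .surface      = surface,
        .present_mode = VK_PRESENT_MODE_FIFO_KHR,  // vsync'd, always supported
    ));
}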
+ // + // Warning: Leaving this zero-initialized is the same as having specified + // VK_PRESENT_MODE_IMMEDIATE_KHR, which is probably not what the user + // wants! + VkPresentModeKHR present_mode; + + // Allow up to N in-flight frames. This essentially controls how many + // rendering commands may be queued up at the same time. See the + // documentation for `pl_swapchain_get_latency` for more information. For + // vulkan specifically, we are only able to wait until the GPU has finished + // rendering a frame - we are unable to wait until the display has actually + // finished displaying it. So this only provides a rough guideline. + // Optional, defaults to 3. + int swapchain_depth; + + // This suppresses automatic recreation of the swapchain when any call + // returns VK_SUBOPTIMAL_KHR. Normally, libplacebo will recreate the + // swapchain internally on the next `pl_swapchain_start_frame`. If enabled, + // clients are assumed to take care of swapchain recreations themselves, by + // calling `pl_swapchain_resize` as appropriate. libplacebo will tolerate + // the "suboptimal" status indefinitely. + bool allow_suboptimal; + + // Disable high-bit (10 or more) SDR formats. May help work around buggy + // drivers which don't dither properly when outputting high bit depth + // SDR backbuffers to 8-bit screens. + bool disable_10bit_sdr; +}; + +#define pl_vulkan_swapchain_params(...) (&(struct pl_vulkan_swapchain_params) { __VA_ARGS__ }) + +// Creates a new vulkan swapchain based on an existing VkSurfaceKHR. Using this +// function requires that the vulkan device was created with the +// VK_KHR_swapchain extension. The easiest way of accomplishing this is to set +// the `pl_vulkan_params.surface` explicitly at creation time. +PL_API pl_swapchain pl_vulkan_create_swapchain(pl_vulkan vk, + const struct pl_vulkan_swapchain_params *params); + +// This will return true if the vulkan swapchain is internally detected +// as being suboptimal (VK_SUBOPTIMAL_KHR). This might be of use to clients +// who have `params->allow_suboptimal` enabled. +PL_API bool pl_vulkan_swapchain_suboptimal(pl_swapchain sw); + +// Vulkan interop API, for sharing a single VkDevice (and associated vulkan +// resources) directly with the API user. The use of this API is a bit sketchy +// and requires careful communication of Vulkan API state. + +struct pl_vulkan_import_params { + // The vulkan instance. Required. + // + // Note: The VkInstance provided by the user *must* be created with a + // VkApplicationInfo.apiVersion of PL_VK_MIN_VERSION or higher. + VkInstance instance; + + // Pointer to `vkGetInstanceProcAddr`. If this is NULL, libplacebo will + // use the directly linked version (if available). + PFN_vkGetInstanceProcAddr get_proc_addr; + + // The physical device selected by the user. Required. + VkPhysicalDevice phys_device; + + // The logical device created by the user. Required. + VkDevice device; + + // --- Logical device parameters + + // List of all device-level extensions that were enabled. (Instance-level + // extensions need not be re-specified here, since it's guaranteed that any + // instance-level extensions that device-level extensions depend on were + // enabled at the instance level) + const char * const *extensions; + int num_extensions; + + // Enabled queue families. At least `queue_graphics` is required. + // + // It's okay for multiple queue families to be specified with the same + // index, e.g. in the event that a dedicated compute queue also happens to + // be the dedicated transfer queue. 
+ // + // It's also okay to leave the queue struct as {0} in the event that no + // dedicated queue exists for a given operation type. libplacebo will + // automatically fall back to using e.g. the graphics queue instead. + struct pl_vulkan_queue queue_graphics; // must support VK_QUEUE_GRAPHICS_BIT + struct pl_vulkan_queue queue_compute; // must support VK_QUEUE_COMPUTE_BIT + struct pl_vulkan_queue queue_transfer; // must support VK_QUEUE_TRANSFER_BIT + + // Enabled VkPhysicalDeviceFeatures. The device *must* be created with + // all of the features in `pl_vulkan_required_features` enabled. + const VkPhysicalDeviceFeatures2 *features; + + // Functions for locking a queue. If set, these will be used instead of + // libplacebo's internal functions for `pl_vulkan.(un)lock_queue`. + void (*lock_queue)(void *ctx, uint32_t qf, uint32_t qidx); + void (*unlock_queue)(void *ctx, uint32_t qf, uint32_t qidx); + void *queue_ctx; + + // --- Misc/debugging options + + // Restrict specific features to e.g. work around driver bugs, or simply + // for testing purposes. See `pl_vulkan_params` for a description of these. + int max_glsl_version; + uint32_t max_api_version; +}; + +#define pl_vulkan_import_params(...) (&(struct pl_vulkan_import_params) { __VA_ARGS__ }) + +// For purely informative reasons, this contains a list of extensions and +// device features that libplacebo *can* make use of. These are all strictly +// optional, but provide a hint to the API user as to what might be worth +// enabling at device creation time. +// +// Note: This also includes physical device features provided by extensions. +// They are all provided using extension-specific features structs, rather +// than the more general purpose VkPhysicalDeviceVulkan11Features etc. +PL_API extern const char * const pl_vulkan_recommended_extensions[]; +PL_API extern const int pl_vulkan_num_recommended_extensions; +PL_API extern const VkPhysicalDeviceFeatures2 pl_vulkan_recommended_features; + +// A list of device features that are required by libplacebo. These +// *must* be provided by imported Vulkan devices. +// +// Note: `pl_vulkan_recommended_features` does not include this list. +PL_API extern const VkPhysicalDeviceFeatures2 pl_vulkan_required_features; + +// Import an existing VkDevice instead of creating a new one, and wrap it into +// a `pl_vulkan` abstraction. It's safe to `pl_vulkan_destroy` this, which will +// destroy application state related to libplacebo but leave the underlying +// VkDevice intact. +PL_API pl_vulkan pl_vulkan_import(pl_log log, const struct pl_vulkan_import_params *params); + +struct pl_vulkan_wrap_params { + // The image itself. It *must* be usable concurrently by all of the queue + // family indices listed in `pl_vulkan->queues`. Note that this requires + // the use of VK_SHARING_MODE_CONCURRENT if `pl_vulkan->num_queues` is + // greater than 1. If this is difficult to achieve for the user, then + // `async_transfer` / `async_compute` should be turned off, which + // guarantees the use of only one queue family. + VkImage image; + + // Which aspect of `image` to wrap. Only useful for wrapping individual + // sub-planes of planar images. If left as 0, it defaults to the entire + // image (i.e. the union of VK_IMAGE_ASPECT_PLANE_N_BIT for planar formats, + // and VK_IMAGE_ASPECT_COLOR_BIT otherwise). + VkImageAspectFlags aspect; + + // The image's dimensions (unused dimensions must be 0) + int width; + int height; + int depth; + + // The image's format. 
libplacebo will try to map this to an equivalent + // pl_fmt. If no compatible pl_fmt is found, wrapping will fail. + VkFormat format; + + // The usage flags the image was created with. libplacebo will set the + // pl_tex capabilities to include whatever it can, as determined by the set + // of enabled usage flags. + VkImageUsageFlags usage; + + // See `pl_tex_params` + void *user_data; + pl_debug_tag debug_tag; +}; + +#define pl_vulkan_wrap_params(...) (&(struct pl_vulkan_wrap_params) { \ + .debug_tag = PL_DEBUG_TAG, \ + __VA_ARGS__ \ + }) + +// Wraps an external VkImage into a pl_tex abstraction. By default, the image +// is considered "held" by the user and must be released before calling any +// pl_tex_* API calls on it (see `pl_vulkan_release`). +// +// This wrapper can be destroyed by simply calling `pl_tex_destroy` on it, +// which will not destroy the underlying VkImage. If a pl_tex wrapper is +// destroyed while an image is not currently being held by the user, that +// image is left in an undefined state. +// +// Wrapping the same VkImage multiple times is undefined behavior, as is trying +// to wrap an image belonging to a different VkDevice than the one in use by +// `gpu`. +// +// This function may fail, in which case it returns NULL. +PL_API pl_tex pl_vulkan_wrap(pl_gpu gpu, const struct pl_vulkan_wrap_params *params); + +// Analogous to `pl_vulkan_wrap`, this function takes any `pl_tex` (including +// ones created by `pl_tex_create`) and unwraps it to expose the underlying +// VkImage to the user. Unlike `pl_vulkan_wrap`, this `pl_tex` is *not* +// considered held after calling this function - the user must explicitly +// `pl_vulkan_hold` before accessing the VkImage. +// +// `out_format` and `out_flags` will be updated to hold the VkImage's +// format and usage flags. (Optional) +PL_API VkImage pl_vulkan_unwrap(pl_gpu gpu, pl_tex tex, + VkFormat *out_format, VkImageUsageFlags *out_flags); + +// Represents a vulkan semaphore/value pair (for compatibility with timeline +// semaphores). When using normal, binary semaphores, `value` may be ignored. +typedef struct pl_vulkan_sem { + VkSemaphore sem; + uint64_t value; +} pl_vulkan_sem; + +struct pl_vulkan_hold_params { + // The Vulkan image to hold. It will be marked as held. Attempting to + // perform any pl_tex_* operation (except pl_tex_destroy) on a held image + // is undefined behavior. + pl_tex tex; + + // The layout to transition the image to when holding. Alternatively, a + // pointer to receive the current image layout. If `out_layout` is + // provided, `layout` is ignored. + VkImageLayout layout; + VkImageLayout *out_layout; + + // The queue family index to transition the image to. This can be used with + // VK_QUEUE_FAMILY_EXTERNAL to transition the image to an external API. As + // a special case, if set to VK_QUEUE_FAMILY_IGNORED, libplacebo will not + // transition the image, even if this image was not set up for concurrent + // usage. Ignored for concurrent images. + uint32_t qf; + + // The semaphore to fire when the image is available for use. (Required) + pl_vulkan_sem semaphore; +}; + +#define pl_vulkan_hold_params(...) (&(struct pl_vulkan_hold_params) { __VA_ARGS__ }) + +// "Hold" a shared image, transferring control over the image to the user. +// Returns whether successful. +PL_API bool pl_vulkan_hold_ex(pl_gpu gpu, const struct pl_vulkan_hold_params *params); + +struct pl_vulkan_release_params { + // The image to be released. It must be marked as "held". 
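For the hold/release interop flow around these params structs, a condensed sketch follows; it is an editor's illustration, not part of the header, and assumes the caller manages a timeline semaphore (e.g. created with the `pl_vulkan_sem_create` helper declared further below) together with its current counter value:

#include <libplacebo/vulkan.h>

static void external_access(pl_gpu gpu, pl_tex tex, VkSemaphore timeline,
                            uint64_t *counter)
{
    VkImageLayout layout;

    // Hold: hand the image over to external code. libplacebo signals the
    // semaphore at the given value once the image may be used externally.
    if (!pl_vulkan_hold_ex(gpu, pl_vulkan_hold_params(
            .tex        = tex,
            .out_layout = &layout,                 // query the current layout
            .qf         = VK_QUEUE_FAMILY_IGNORED, // skip ownership transition
            .semaphore  = { .sem = timeline, .value = ++*counter },
        )))
        return;

    // ... external Vulkan commands: wait on (timeline, *counter), use the
    //     VkImage, leave it in `layout`, then signal (timeline, *counter + 1) ...

    // Release: give control back to libplacebo, waiting on the external work.
    pl_vulkan_release_ex(gpu, pl_vulkan_release_params(
        .tex       = tex,
        .layout    = layout,
        .qf        = VK_QUEUE_FAMILY_IGNORED,
        .semaphore = { .sem = timeline, .value = ++*counter },
    ));
}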
Performing any
+ // operation on the VkImage underlying this `pl_tex` while it is not being
+ // held by the user is undefined behavior.
+ pl_tex tex;
+
+ // The current layout of the image at the point in time when `semaphore`
+ // fires, or if no semaphore is specified, at the time of call.
+ VkImageLayout layout;
+
+ // The queue family index to transition the image to. This can be used with
+ // VK_QUEUE_FAMILY_EXTERNAL to transition the image from an external API. As
+ // a special case, if set to VK_QUEUE_FAMILY_IGNORED, libplacebo will not
+ // transition the image, even if this image was not set up for concurrent
+ // usage. Ignored for concurrent images.
+ uint32_t qf;
+
+ // The semaphore to wait on before libplacebo will actually use or modify
+ // the image. (Optional)
+ //
+ // Note: the lifetime of `semaphore` is indeterminate, and destroying it
+ // while the texture is still depending on that semaphore is undefined
+ // behavior.
+ //
+ // Technically, the only way to be sure that it's safe to free is to use
+ // `pl_gpu_finish()` or similar (e.g. `pl_vulkan_destroy` or
+ // `vkDeviceWaitIdle`) after another operation involving `tex` has been
+ // emitted (or the texture has been destroyed).
+ //
+ // Warning: If `tex` is a planar image (`pl_fmt.num_planes > 0`), and
+ // `semaphore` is specified, it *must* be a timeline semaphore! Failure to
+ // respect this will result in undefined behavior. This warning does not
+ // apply to individual planes (as exposed by `pl_tex.planes`).
+ pl_vulkan_sem semaphore;
+};
+
+#define pl_vulkan_release_params(...) (&(struct pl_vulkan_release_params) { __VA_ARGS__ })
+
+// "Release" a shared image, transferring control to libplacebo.
+PL_API void pl_vulkan_release_ex(pl_gpu gpu, const struct pl_vulkan_release_params *params);
+
+struct pl_vulkan_sem_params {
+ // The type of semaphore to create.
+ VkSemaphoreType type;
+
+ // For VK_SEMAPHORE_TYPE_TIMELINE, sets the initial timeline value.
+ uint64_t initial_value;
+
+ // If set, exports this VkSemaphore to the handle given in `out_handle`.
+ // The user takes over ownership, and should manually close it before
+ // destroying this VkSemaphore (via `pl_vulkan_sem_destroy`).
+ enum pl_handle_type export_handle;
+ union pl_handle *out_handle;
+
+ // Optional debug tag to identify this semaphore.
+ pl_debug_tag debug_tag;
+};
+
+#define pl_vulkan_sem_params(...) (&(struct pl_vulkan_sem_params) { \
+ .debug_tag = PL_DEBUG_TAG, \
+ __VA_ARGS__ \
+ })
+
+// Helper functions to create and destroy vulkan semaphores. Returns
+// VK_NULL_HANDLE on failure.
+PL_API VkSemaphore pl_vulkan_sem_create(pl_gpu gpu, const struct pl_vulkan_sem_params *params);
+PL_API void pl_vulkan_sem_destroy(pl_gpu gpu, VkSemaphore *semaphore);
+
+// Backwards-compatibility wrappers for older versions of the API.
+PL_DEPRECATED PL_API bool pl_vulkan_hold(pl_gpu gpu, pl_tex tex, VkImageLayout layout,
+ pl_vulkan_sem sem_out);
+PL_DEPRECATED PL_API bool pl_vulkan_hold_raw(pl_gpu gpu, pl_tex tex, VkImageLayout *out_layout,
+ pl_vulkan_sem sem_out);
+PL_DEPRECATED PL_API void pl_vulkan_release(pl_gpu gpu, pl_tex tex, VkImageLayout layout,
+ pl_vulkan_sem sem_in);
+
+PL_API_END
+
+#endif // LIBPLACEBO_VULKAN_H_
diff --git a/src/log.c b/src/log.c
new file mode 100644
index 0000000..0829dd3
--- /dev/null
+++ b/src/log.c
@@ -0,0 +1,471 @@
+/*
+ * This file is part of libplacebo.
+ * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <stdio.h> +#include <locale.h> + +#include "common.h" +#include "log.h" +#include "pl_thread.h" + +struct priv { + pl_mutex lock; + enum pl_log_level log_level_cap; + pl_str logbuffer; +}; + +pl_log pl_log_create(int api_ver, const struct pl_log_params *params) +{ + (void) api_ver; + struct pl_log_t *log = pl_zalloc_obj(NULL, log, struct priv); + struct priv *p = PL_PRIV(log); + log->params = *PL_DEF(params, &pl_log_default_params); + pl_mutex_init(&p->lock); + pl_info(log, "Initialized libplacebo %s (API v%d)", PL_VERSION, PL_API_VER); + return log; +} + +const struct pl_log_params pl_log_default_params = {0}; + +void pl_log_destroy(pl_log *plog) +{ + pl_log log = *plog; + if (!log) + return; + + struct priv *p = PL_PRIV(log); + pl_mutex_destroy(&p->lock); + pl_free((void *) log); + *plog = NULL; +} + +struct pl_log_params pl_log_update(pl_log ptr, const struct pl_log_params *params) +{ + struct pl_log_t *log = (struct pl_log_t *) ptr; + if (!log) + return pl_log_default_params; + + struct priv *p = PL_PRIV(log); + pl_mutex_lock(&p->lock); + struct pl_log_params prev_params = log->params; + log->params = *PL_DEF(params, &pl_log_default_params); + pl_mutex_unlock(&p->lock); + + return prev_params; +} + +enum pl_log_level pl_log_level_update(pl_log ptr, enum pl_log_level level) +{ + struct pl_log_t *log = (struct pl_log_t *) ptr; + if (!log) + return PL_LOG_NONE; + + struct priv *p = PL_PRIV(log); + pl_mutex_lock(&p->lock); + enum pl_log_level prev_level = log->params.log_level; + log->params.log_level = level; + pl_mutex_unlock(&p->lock); + + return prev_level; +} + +void pl_log_level_cap(pl_log log, enum pl_log_level cap) +{ + if (!log) + return; + + struct priv *p = PL_PRIV(log); + pl_mutex_lock(&p->lock); + p->log_level_cap = cap; + pl_mutex_unlock(&p->lock); +} + +static FILE *default_stream(void *stream, enum pl_log_level level) +{ + return PL_DEF(stream, level <= PL_LOG_WARN ? 
stderr : stdout); +} + +void pl_log_simple(void *stream, enum pl_log_level level, const char *msg) +{ + static const char *prefix[] = { + [PL_LOG_FATAL] = "fatal", + [PL_LOG_ERR] = "error", + [PL_LOG_WARN] = "warn", + [PL_LOG_INFO] = "info", + [PL_LOG_DEBUG] = "debug", + [PL_LOG_TRACE] = "trace", + }; + + FILE *h = default_stream(stream, level); + fprintf(h, "%5s: %s\n", prefix[level], msg); + if (level <= PL_LOG_WARN) + fflush(h); +} + +void pl_log_color(void *stream, enum pl_log_level level, const char *msg) +{ + static const char *color[] = { + [PL_LOG_FATAL] = "31;1", // bright red + [PL_LOG_ERR] = "31", // red + [PL_LOG_WARN] = "33", // yellow/orange + [PL_LOG_INFO] = "32", // green + [PL_LOG_DEBUG] = "34", // blue + [PL_LOG_TRACE] = "30;1", // bright black + }; + + FILE *h = default_stream(stream, level); + fprintf(h, "\033[%sm%s\033[0m\n", color[level], msg); + if (level <= PL_LOG_WARN) + fflush(h); +} + +static void pl_msg_va(pl_log log, enum pl_log_level lev, + const char *fmt, va_list va) +{ + // Test log message without taking the lock, to avoid thrashing the + // lock for thousands of trace messages unless those are actually + // enabled. This may be a false negative, in which case log messages may + // be lost as a result. But this shouldn't be a big deal, since any + // situation leading to lost log messages would itself be a race condition. + if (!pl_msg_test(log, lev)) + return; + + // Re-test the log message level with held lock to avoid false positives, + // which would be a considerably bigger deal than false negatives + struct priv *p = PL_PRIV(log); + pl_mutex_lock(&p->lock); + + // Apply this cap before re-testing the log level, to avoid giving users + // messages that should have been dropped by the log level. + lev = PL_MAX(lev, p->log_level_cap); + if (!pl_msg_test(log, lev)) + goto done; + + p->logbuffer.len = 0; + pl_str_append_vasprintf((void *) log, &p->logbuffer, fmt, va); + log->params.log_cb(log->params.log_priv, lev, (char *) p->logbuffer.buf); + +done: + pl_mutex_unlock(&p->lock); +} + +void pl_msg(pl_log log, enum pl_log_level lev, const char *fmt, ...) +{ + va_list va; + va_start(va, fmt); + pl_msg_va(log, lev, fmt, va); + va_end(va); +} + +void pl_msg_source(pl_log log, enum pl_log_level lev, const char *src) +{ + if (!pl_msg_test(log, lev) || !src) + return; + + int line = 1; + while (*src) { + const char *end = strchr(src, '\n'); + if (!end) { + pl_msg(log, lev, "[%3d] %s", line, src); + break; + } + + pl_msg(log, lev, "[%3d] %.*s", line, (int)(end - src), src); + src = end + 1; + line++; + } +} + +#ifdef PL_HAVE_DBGHELP + +#include <windows.h> +#include <dbghelp.h> +#include <shlwapi.h> + +// https://github.com/llvm/llvm-project/blob/f03cd763384bbb67ddfa12957859ed58841d4b34/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace.h#L85-L106 +static inline uintptr_t get_prev_inst_pc(uintptr_t pc) { +#if defined(__arm__) + // T32 (Thumb) branch instructions might be 16 or 32 bit long, + // so we return (pc-2) in that case in order to be safe. + // For A32 mode we return (pc-4) because all instructions are 32 bit long. 
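+ // In both cases the final & ~1 keeps the result halfword-aligned, stripping
+ // the Thumb state bit if it happened to be set in the captured address.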
+ return (pc - 3) & (~1); +#elif defined(__x86_64__) || defined(__i386__) + return pc - 1; +#else + return pc - 4; +#endif +} + +static DWORD64 get_preferred_base(const char *module) +{ + DWORD64 image_base = 0; + HANDLE file_mapping = NULL; + HANDLE file_view = NULL; + + HANDLE file = CreateFile(module, GENERIC_READ, FILE_SHARE_READ, NULL, + OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0); + if (file == INVALID_HANDLE_VALUE) + goto done; + + file_mapping = CreateFileMapping(file, NULL, PAGE_READONLY, 0, 0, NULL); + if (file_mapping == NULL) + goto done; + + file_view = MapViewOfFile(file_mapping, FILE_MAP_READ, 0, 0, 0); + if (file_view == NULL) + goto done; + + PIMAGE_DOS_HEADER dos_header = (PIMAGE_DOS_HEADER) file_view; + if (dos_header->e_magic != IMAGE_DOS_SIGNATURE) + goto done; + + PIMAGE_NT_HEADERS pe_header = (PIMAGE_NT_HEADERS) ((char *) file_view + + dos_header->e_lfanew); + if (pe_header->Signature != IMAGE_NT_SIGNATURE) + goto done; + + if (pe_header->FileHeader.SizeOfOptionalHeader != sizeof(pe_header->OptionalHeader)) + goto done; + + image_base = pe_header->OptionalHeader.ImageBase; + +done: + if (file_view) + UnmapViewOfFile(file_view); + if (file_mapping) + CloseHandle(file_mapping); + if (file != INVALID_HANDLE_VALUE) + CloseHandle(file); + + return image_base; +} + +void pl_log_stack_trace(pl_log log, enum pl_log_level lev) +{ + if (!pl_msg_test(log, lev)) + return; + + void *tmp = pl_tmp(NULL); + PL_ARRAY(void *) frames = {0}; + + size_t capacity = 16; + do { + capacity *= 2; + PL_ARRAY_RESIZE(tmp, frames, capacity); + // Skip first frame, we don't care about this function + frames.num = CaptureStackBackTrace(1, capacity, frames.elem, NULL); + } while (capacity == frames.num); + + if (!frames.num) { + pl_free(tmp); + return; + } + + // Load dbghelp on demand. While it is available on all Windows versions, + // no need to keep it loaded all the time as stack trace printing function, + // in theory should be used repetitively rarely. + HANDLE process = GetCurrentProcess(); + HMODULE dbghelp = LoadLibrary("dbghelp.dll"); + DWORD options; + SYMBOL_INFO *symbol = NULL; + BOOL use_dbghelp = !!dbghelp; + +#define DBGHELP_SYM(sym) \ + __typeof__(&sym) p##sym = (__typeof__(&sym))(void *) GetProcAddress(dbghelp, #sym); \ + use_dbghelp &= !!p##sym + + DBGHELP_SYM(SymCleanup); + DBGHELP_SYM(SymFromAddr); + DBGHELP_SYM(SymGetLineFromAddr64); + DBGHELP_SYM(SymGetModuleInfo64); + DBGHELP_SYM(SymGetOptions); + DBGHELP_SYM(SymGetSearchPathW); + DBGHELP_SYM(SymInitialize); + DBGHELP_SYM(SymSetOptions); + DBGHELP_SYM(SymSetSearchPathW); + +#undef DBGHELP_SYM + + struct priv *p = PL_PRIV(log); + PL_ARRAY(wchar_t) base_search = { .num = 1024 }; + + if (use_dbghelp) { + // DbgHelp is not thread-safe. Note that on Windows mutex is recursive, + // so no need to unlock before calling pl_msg. + pl_mutex_lock(&p->lock); + + options = pSymGetOptions(); + pSymSetOptions(SYMOPT_UNDNAME | SYMOPT_DEFERRED_LOADS | + SYMOPT_LOAD_LINES | SYMOPT_FAVOR_COMPRESSED); + use_dbghelp &= pSymInitialize(process, NULL, TRUE); + + if (use_dbghelp) { + symbol = pl_alloc(tmp, sizeof(SYMBOL_INFO) + 512); + symbol->SizeOfStruct = sizeof(SYMBOL_INFO); + symbol->MaxNameLen = 512; + + PL_ARRAY_RESIZE(tmp, base_search, base_search.num); + BOOL ret = pSymGetSearchPathW(process, base_search.elem, + base_search.num); + base_search.num = ret ? 
wcslen(base_search.elem) : 0;
+ PL_ARRAY_APPEND(tmp, base_search, L'\0');
+ } else {
+ pSymSetOptions(options);
+ pl_mutex_unlock(&p->lock);
+ }
+ }
+
+ pl_msg(log, lev, " Backtrace:");
+ for (int n = 0; n < frames.num; n++) {
+ uintptr_t pc = get_prev_inst_pc((uintptr_t) frames.elem[n]);
+ pl_str out = {0};
+ pl_str_append_asprintf(tmp, &out, " #%-2d 0x%"PRIxPTR, n, pc);
+
+ MEMORY_BASIC_INFORMATION meminfo = {0};
+ char module_path[MAX_PATH] = {0};
+ if (VirtualQuery((LPCVOID) pc, &meminfo, sizeof(meminfo))) {
+ DWORD sz = GetModuleFileNameA(meminfo.AllocationBase, module_path,
+ sizeof(module_path));
+ if (sz == sizeof(module_path))
+ pl_msg(log, PL_LOG_ERR, "module path truncated");
+
+ if (use_dbghelp) {
+ // According to the documentation it should search in "The directory
+ // that contains the corresponding module.", but that doesn't appear
+ // to work, so manually set the search path to the module's directory.
+ // https://learn.microsoft.com/windows/win32/debug/symbol-paths
+ PL_ARRAY(wchar_t) mod_search = { .num = MAX_PATH };
+ PL_ARRAY_RESIZE(tmp, mod_search, mod_search.num);
+
+ sz = GetModuleFileNameW(meminfo.AllocationBase,
+ mod_search.elem, mod_search.num);
+
+ if (sz > 0 && sz != MAX_PATH &&
+ // TODO: Replace with PathCchRemoveFileSpec once mingw-w64
+ // >= 8.0.1 is commonly available, at the time of writing
+ // there are a few high profile Linux distributions that ship
+ // 8.0.0.
+ PathRemoveFileSpecW(mod_search.elem))
+ {
+ mod_search.num = wcslen(mod_search.elem);
+ PL_ARRAY_APPEND(tmp, mod_search, L';');
+ PL_ARRAY_CONCAT(tmp, mod_search, base_search);
+ pSymSetSearchPathW(process, mod_search.elem);
+ }
+ }
+ }
+
+ DWORD64 sym_displacement;
+ if (use_dbghelp && pSymFromAddr(process, pc, &sym_displacement, symbol))
+ pl_str_append_asprintf(tmp, &out, " in %s+0x%llx",
+ symbol->Name, sym_displacement);
+
+ DWORD line_displacement;
+ IMAGEHLP_LINE64 line = {sizeof(line)};
+ if (use_dbghelp &&
+ pSymGetLineFromAddr64(process, pc, &line_displacement, &line))
+ {
+ pl_str_append_asprintf(tmp, &out, " %s:%lu+0x%lx", line.FileName,
+ line.LineNumber, line_displacement);
+ goto done;
+ }
+
+ // LLVM tools conventionally take absolute addresses computed against the
+ // module's "preferred" image base. We have to read that base from the
+ // binary, since with ASLR the module is not actually loaded there.
+ // Windows tools like WinDbg, on the other hand, expect an offset relative
+ // to the actual image base. To make the output usable with both, print
+ // both values.
+ DWORD64 module_base = get_preferred_base(module_path);
+ pl_str_append_asprintf(tmp, &out, " (%s+0x%"PRIxPTR") (0x%llx)", module_path,
+ pc - (uintptr_t) meminfo.AllocationBase,
+ module_base + (pc - (uintptr_t) meminfo.AllocationBase));
+
+done:
+ pl_msg(log, lev, "%s", out.buf);
+ }
+
+ if (use_dbghelp) {
+ pSymSetOptions(options);
+ pSymCleanup(process);
+ pl_mutex_unlock(&p->lock);
+ }
+ // Unload dbghelp. Maybe it would be better to keep it loaded?
+ if (dbghelp) + FreeLibrary(dbghelp); + pl_free(tmp); +} + +#elif defined(PL_HAVE_UNWIND) +#define UNW_LOCAL_ONLY +#include <libunwind.h> +#include <dlfcn.h> + +void pl_log_stack_trace(pl_log log, enum pl_log_level lev) +{ + if (!pl_msg_test(log, lev)) + return; + + unw_cursor_t cursor; + unw_context_t uc; + unw_word_t ip, off; + unw_getcontext(&uc); + unw_init_local(&cursor, &uc); + + int depth = 0; + pl_msg(log, lev, " Backtrace:"); + while (unw_step(&cursor) > 0) { + char symbol[256] = "<unknown>"; + Dl_info info = { + .dli_fname = "<unknown>", + }; + + unw_get_reg(&cursor, UNW_REG_IP, &ip); + unw_get_proc_name(&cursor, symbol, sizeof(symbol), &off); + dladdr((void *) (uintptr_t) ip, &info); + pl_msg(log, lev, " #%-2d 0x%016" PRIxPTR " in %s+0x%" PRIxPTR" at %s+0x%" PRIxPTR, + depth++, ip, symbol, off, info.dli_fname, ip - (uintptr_t) info.dli_fbase); + } +} + +#elif defined(PL_HAVE_EXECINFO) +#include <execinfo.h> + +void pl_log_stack_trace(pl_log log, enum pl_log_level lev) +{ + if (!pl_msg_test(log, lev)) + return; + + PL_ARRAY(void *) buf = {0}; + size_t buf_avail = 16; + do { + buf_avail *= 2; + PL_ARRAY_RESIZE(NULL, buf, buf_avail); + buf.num = backtrace(buf.elem, buf_avail); + } while (buf.num == buf_avail); + + pl_msg(log, lev, " Backtrace:"); + char **strings = backtrace_symbols(buf.elem, buf.num); + for (int i = 1; i < buf.num; i++) + pl_msg(log, lev, " #%-2d %s", i - 1, strings[i]); + + free(strings); + pl_free(buf.elem); +} + +#else +void pl_log_stack_trace(pl_log log, enum pl_log_level lev) { } +#endif diff --git a/src/log.h b/src/log.h new file mode 100644 index 0000000..dcf8d28 --- /dev/null +++ b/src/log.h @@ -0,0 +1,84 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include <stdarg.h> + +#include "common.h" + +#include <libplacebo/log.h> + +// Internal logging-related functions + +// Warning: Not entirely thread-safe. Exercise caution when using. May result +// in either false positives or false negatives. Make sure to re-run this +// function while `lock` is held, to ensure no race conditions on the check. +static inline bool pl_msg_test(pl_log log, enum pl_log_level lev) +{ + return log && log->params.log_cb && log->params.log_level >= lev; +} + +void pl_msg(pl_log log, enum pl_log_level lev, const char *fmt, ...) + PL_PRINTF(3, 4); + +// Convenience macros +#define pl_fatal(log, ...) pl_msg(log, PL_LOG_FATAL, __VA_ARGS__) +#define pl_err(log, ...) pl_msg(log, PL_LOG_ERR, __VA_ARGS__) +#define pl_warn(log, ...) pl_msg(log, PL_LOG_WARN, __VA_ARGS__) +#define pl_info(log, ...) pl_msg(log, PL_LOG_INFO, __VA_ARGS__) +#define pl_debug(log, ...) pl_msg(log, PL_LOG_DEBUG, __VA_ARGS__) +#define pl_trace(log, ...) pl_msg(log, PL_LOG_TRACE, __VA_ARGS__) + +#define PL_MSG(obj, lev, ...) pl_msg((obj)->log, lev, __VA_ARGS__) + +#define PL_FATAL(obj, ...) 
PL_MSG(obj, PL_LOG_FATAL, __VA_ARGS__) +#define PL_ERR(obj, ...) PL_MSG(obj, PL_LOG_ERR, __VA_ARGS__) +#define PL_WARN(obj, ...) PL_MSG(obj, PL_LOG_WARN, __VA_ARGS__) +#define PL_INFO(obj, ...) PL_MSG(obj, PL_LOG_INFO, __VA_ARGS__) +#define PL_DEBUG(obj, ...) PL_MSG(obj, PL_LOG_DEBUG, __VA_ARGS__) +#define PL_TRACE(obj, ...) PL_MSG(obj, PL_LOG_TRACE, __VA_ARGS__) + +// Log something with line numbers included +void pl_msg_source(pl_log log, enum pl_log_level lev, const char *src); + +// Temporarily cap the log level to a certain verbosity. This is intended for +// things like probing formats, attempting to create buffers that may fail, and +// other types of operations in which we want to suppress errors. Call with +// PL_LOG_NONE to disable this cap. +// +// Warning: This is generally not thread-safe, and only provided as a temporary +// hack until a better solution can be thought of. +void pl_log_level_cap(pl_log log, enum pl_log_level cap); + +// CPU execution time reporting helper +static inline void pl_log_cpu_time(pl_log log, pl_clock_t start, pl_clock_t stop, + const char *operation) +{ + double ms = pl_clock_diff(stop, start) * 1e3; + enum pl_log_level lev = PL_LOG_DEBUG; + if (ms > 10) + lev = PL_LOG_INFO; + if (ms > 1000) + lev = PL_LOG_WARN; + + pl_msg(log, lev, "Spent %.3f ms %s%s", ms, operation, + ms > 100 ? " (slow!)" : ""); +} + +// Log stack trace +PL_NOINLINE void pl_log_stack_trace(pl_log log, enum pl_log_level lev); diff --git a/src/meson.build b/src/meson.build new file mode 100644 index 0000000..63f9d53 --- /dev/null +++ b/src/meson.build @@ -0,0 +1,347 @@ +### Common dependencies +unwind = dependency('libunwind', required: get_option('unwind')) +libexecinfo = cc.find_library('execinfo', required: false) +has_execinfo = cc.has_function('backtrace_symbols', dependencies: libexecinfo, prefix: '#include <execinfo.h>') +dbghelp = cc.check_header('dbghelp.h', prefix: '#include <windows.h>') +conf_internal.set('PL_HAVE_DBGHELP', dbghelp) +conf_internal.set('PL_HAVE_UNWIND', unwind.found()) +conf_internal.set('PL_HAVE_EXECINFO', has_execinfo) +if dbghelp + build_deps += cc.find_library('shlwapi', required: true) +elif unwind.found() + build_deps += [unwind, cc.find_library('dl', required : false)] +elif has_execinfo + build_deps += libexecinfo +endif + +link_args = [] +link_depends = [] + +# Looks like meson in certain configuration returns ' ' instead of empty string +mingw32 = cc.get_define('__MINGW32__').strip() +if host_machine.system() == 'windows' and mingw32 != '' and host_machine.cpu() in ['aarch64', 'arm', 'x86_64'] + # MinGW-w64 math functions are significantly slower than the UCRT ones. + # In particular powf is over 7 times slower than UCRT counterpart. + # MinGW-w64 explicitly excludes some math functions from their ucrtbase def + # file and replaces with own versions. To workaround the issue, generate the + # import library and link it with UCRT versions of math functions. + dlltool = find_program('llvm-dlltool', 'dlltool') + ucrt_math = custom_target('ucrt_math.lib', + output : ['ucrt_math.lib'], + input : 'ucrt_math.def', + command : [dlltool, '-d', '@INPUT@', '-l', '@OUTPUT@']) + link_args += ucrt_math.full_path() + link_depends += ucrt_math + # MinGW-w64 inlines functions like powf, rewriting them to pow. We want to use + # the powf specialization from UCRT, so disable inlining. 
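+ # __CRT__NO_INLINE is the guard the MinGW-w64 CRT headers check before
+ # emitting their inline math wrappers; defining it project-wide makes the
+ # code call the imported UCRT functions instead.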
+ add_project_arguments(['-D__CRT__NO_INLINE'], language: ['c', 'cpp']) +endif + +# Work around missing atomics on some (obscure) platforms +atomic_test = ''' +#include <stdatomic.h> +#include <stdint.h> +int main(void) { + _Atomic uint32_t x32; + atomic_init(&x32, 0); +}''' + +if not cc.links(atomic_test) + build_deps += cc.find_library('atomic') +endif + + +### Common source files +headers = [ + 'cache.h', + 'colorspace.h', + 'common.h', + 'd3d11.h', + 'dispatch.h', + 'dither.h', + 'dummy.h', + 'filters.h', + 'gamut_mapping.h', + 'gpu.h', + 'log.h', + 'opengl.h', + 'options.h', + 'renderer.h', + 'shaders/colorspace.h', + 'shaders/custom.h', + 'shaders/deinterlacing.h', + 'shaders/dithering.h', + 'shaders/film_grain.h', + 'shaders/icc.h', + 'shaders/lut.h', + 'shaders/sampling.h', + 'shaders.h', + 'swapchain.h', + 'tone_mapping.h', + 'utils/dav1d.h', + 'utils/dav1d_internal.h', + 'utils/dolbyvision.h', + 'utils/frame_queue.h', + 'utils/libav.h', + 'utils/libav_internal.h', + 'utils/upload.h', + 'vulkan.h', +] + +sources = [ + 'cache.c', + 'colorspace.c', + 'common.c', + 'convert.cc', + 'dither.c', + 'dispatch.c', + 'dummy.c', + 'filters.c', + 'format.c', + 'gamut_mapping.c', + 'glsl/spirv.c', + 'gpu.c', + 'gpu/utils.c', + 'log.c', + 'options.c', + 'pl_alloc.c', + 'pl_string.c', + 'swapchain.c', + 'tone_mapping.c', + 'utils/dolbyvision.c', + 'utils/frame_queue.c', + 'utils/upload.c', +] + +# Source files that may use GLSL pragmas, we need to use custom_target +# to the proper environment and dependency information for these +foreach f : ['renderer.c', 'shaders.c'] + sources += custom_target(f, + command: glsl_preproc, + depend_files: glsl_deps, + env: python_env, + input: f, + output: f, + ) +endforeach + +# More .c files defined here, we can't put them in this file because of meson +# preventing the use of / in custom_target output filenames +subdir('shaders') + +tests = [ + 'cache.c', + 'colorspace.c', + 'common.c', + 'dither.c', + 'dummy.c', + 'lut.c', + 'filters.c', + 'options.c', + 'string.c', + 'tone_mapping.c', + 'utils.c', +] + +fuzzers = [ + 'lut.c', + 'options.c', + 'shaders.c', + 'user_shaders.c', +] + +components = configuration_data() + + +### Optional dependencies / components +subdir('glsl') +subdir('d3d11') +subdir('opengl') +subdir('vulkan') + +lcms = dependency('lcms2', version: '>=2.9', required: get_option('lcms')) +components.set('lcms', lcms.found()) +if lcms.found() + build_deps += lcms + tests += 'icc.c' +endif + +# Check to see if libplacebo built this way is sane +if not (components.get('vulkan') or components.get('opengl') or components.get('d3d11')) + warning('Building without any graphics API. libplacebo built this way still ' + + 'has some limited use (e.g. generating GLSL shaders), but most of ' + + 'its functionality will be missing or impaired!') +endif + +has_spirv = components.get('shaderc') or components.get('glslang') +needs_spirv = components.get('vulkan') or components.get('d3d11') +if needs_spirv and not has_spirv + warning('Building without any GLSL compiler (shaderc, glslang), but with ' + + 'APIs required that require one (vulkan, d3d11). 
This build is very ' + + 'likely to be very limited in functionality!') +endif + +dovi = get_option('dovi') +components.set('dovi', dovi.allowed()) + +libdovi = dependency('dovi', version: '>=1.6.7', required: get_option('libdovi').require(dovi.allowed())) +components.set('libdovi', libdovi.found()) +if libdovi.found() + build_deps += libdovi +endif + +xxhash_inc = include_directories() +xxhash = dependency('libxxhash', required: get_option('xxhash')) +components.set('xxhash', xxhash.found()) +if xxhash.found() + xxhash_inc = xxhash.get_variable('includedir') +endif + +# Generate configuration files +defs = '' +pc_vars = [] + +foreach comp : components.keys() + found = components.get(comp) + varname = comp.underscorify().to_upper() + summary(comp, found, section: 'Optional features', bool_yn: true) + defs += (found ? '#define PL_HAVE_@0@ 1\n' : '#undef PL_HAVE_@0@\n').format(varname) + pc_vars += 'pl_has_@0@=@1@'.format(varname.to_lower(), found ? 1 : 0) +endforeach + +conf_public.set('extra_defs', defs) +subdir('./include/libplacebo') # generate config.h in the right location +sources += configure_file( + output: 'config_internal.h', + configuration: conf_internal +) + +version_h = vcs_tag( + command: ['git', 'describe', '--dirty'], + fallback: version_pretty, + replace_string: '@buildver@', + input: 'version.h.in', + output: 'version.h', +) + +sources += version_h + +if host_machine.system() == 'windows' + windows = import('windows') + sources += windows.compile_resources(libplacebo_rc, depends: version_h, + include_directories: meson.project_source_root()/'win32') +endif + +fast_float_inc = include_directories() +if fs.is_dir('../3rdparty/fast_float/include') + fast_float_inc = include_directories('../3rdparty/fast_float/include') +endif + +### Main library build process +inc = include_directories('./include') +lib = library('placebo', sources, + c_args: ['-DPL_EXPORT'], + install: true, + dependencies: build_deps + glad_dep, + soversion: apiver, + include_directories: [ inc, vulkan_headers_inc, fast_float_inc, xxhash_inc ], + link_args: link_args, + link_depends: link_depends, + gnu_symbol_visibility: 'hidden', + name_prefix: 'lib' +) + +libplacebo = declare_dependency( + include_directories: inc, + compile_args: get_option('default_library') == 'static' ? ['-DPL_STATIC'] : [], + link_with: lib, + variables: pc_vars, +) + + +### Install process +proj_name = meson.project_name() +foreach h : headers + parts = h.split('/') + path = proj_name + foreach p : parts + if p != parts[-1] + path = path / p + endif + endforeach + + install_headers('include' / proj_name / h, subdir: path) +endforeach + +extra_cflags = [] +if get_option('default_library') == 'static' + extra_cflags = ['-DPL_STATIC'] +elif get_option('default_library') == 'both' + # meson doesn't support Cflags.private, insert it forcefully... 
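+ # The leading newline smuggles a literal "Cflags.private:" key into the
+ # generated .pc file, so the output looks roughly like (illustrative):
+ #   Cflags: -I${includedir}
+ #   Cflags.private: -DPL_STATIC
+ # i.e. -DPL_STATIC is only applied when consumers request static linking
+ # (e.g. pkg-config --static).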
+ extra_cflags = ['\nCflags.private:', '-DPL_STATIC'] +endif + +pkg = import('pkgconfig') +pkg.generate( + name: proj_name, + description: 'Reusable library for GPU-accelerated video/image rendering', + libraries: lib, + version: version, + variables: pc_vars, + extra_cflags: extra_cflags, +) + + +### Testing +tdep_static = declare_dependency( + dependencies: build_deps, + include_directories: [ inc, include_directories('.') ], + compile_args: '-DPL_STATIC' + # TODO: Define objects here once Meson 1.1.0 is ok to use + # objects: lib.extract_all_objects(recursive: false) + ) + +tdep_shared = declare_dependency( + include_directories: [ inc, include_directories('.') ], + compile_args: get_option('default_library') == 'static' ? ['-DPL_STATIC'] : [], + link_with: lib, + ) + +if get_option('tests') + subdir('tests') +endif + +if get_option('bench') + if not components.get('vk-proc-addr') + error('Compiling the benchmark suite requires vulkan support!') + endif + + bench = executable('bench', + 'tests/bench.c', + dependencies: [tdep_shared, vulkan_headers], + link_args: link_args, + link_depends: link_depends, + include_directories: vulkan_headers_inc, + ) + test('benchmark', bench, is_parallel: false, timeout: 600) +endif + +if get_option('fuzz') + foreach f : fuzzers + executable('fuzz.' + f, 'tests/fuzz/' + f, + objects: lib.extract_all_objects(recursive: false), + dependencies: tdep_static, + link_args: link_args, + link_depends: link_depends, + ) + endforeach +endif + +pl_thread = declare_dependency( + include_directories: include_directories('.'), + dependencies: threads, +) + +pl_clock = declare_dependency( + include_directories: include_directories('.'), +) diff --git a/src/opengl/common.h b/src/opengl/common.h new file mode 100644 index 0000000..c84c69f --- /dev/null +++ b/src/opengl/common.h @@ -0,0 +1,66 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#pragma once + +#include "../common.h" +#include "../log.h" +#include "../gpu.h" +#include "pl_thread.h" + +#include <libplacebo/opengl.h> + +// Collision with llvm-mingw <winnt.h> +#undef MemoryBarrier + +#define GLAD_GL +#define GLAD_GLES2 +#include <glad/gl.h> +#include <glad/egl.h> + +typedef GladGLContext gl_funcs; + +// PL_PRIV(pl_opengl) +struct gl_ctx { + pl_log log; + struct pl_opengl_params params; + bool is_debug; + bool is_debug_egl; + bool is_gles; + + // For context locking + pl_mutex lock; + int count; + + // Dispatch table + gl_funcs func; +}; + +struct gl_cb { + void (*callback)(void *priv); + void *priv; + GLsync sync; +}; + +struct fbo_format { + pl_fmt fmt; + const struct gl_format *glfmt; +}; + +// For locking/unlocking +bool gl_make_current(pl_opengl gl); +void gl_release_current(pl_opengl gl); diff --git a/src/opengl/context.c b/src/opengl/context.c new file mode 100644 index 0000000..6ca14b8 --- /dev/null +++ b/src/opengl/context.c @@ -0,0 +1,332 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <ctype.h> + +#include "common.h" +#include "utils.h" +#include "gpu.h" + +const struct pl_opengl_params pl_opengl_default_params = {0}; + +static void GLAPIENTRY debug_cb(GLenum source, GLenum type, GLuint id, + GLenum severity, GLsizei length, + const GLchar *message, const void *userParam) +{ + pl_log log = (void *) userParam; + enum pl_log_level level = PL_LOG_ERR; + + switch (severity) { + case GL_DEBUG_SEVERITY_NOTIFICATION:level = PL_LOG_DEBUG; break; + case GL_DEBUG_SEVERITY_LOW: level = PL_LOG_INFO; break; + case GL_DEBUG_SEVERITY_MEDIUM: level = PL_LOG_WARN; break; + case GL_DEBUG_SEVERITY_HIGH: level = PL_LOG_ERR; break; + } + + pl_msg(log, level, "GL: %s", message); + + if (level <= PL_LOG_ERR) + pl_log_stack_trace(log, level); +} + +static void GLAPIENTRY debug_cb_egl(EGLenum error, const char *command, + EGLint messageType, EGLLabelKHR threadLabel, + EGLLabelKHR objectLabel, const char *message) +{ + pl_log log = threadLabel; + enum pl_log_level level = PL_LOG_ERR; + + switch (messageType) { + case EGL_DEBUG_MSG_CRITICAL_KHR: level = PL_LOG_FATAL; break; + case EGL_DEBUG_MSG_ERROR_KHR: level = PL_LOG_ERR; break; + case EGL_DEBUG_MSG_WARN_KHR: level = PL_LOG_WARN; break; + case EGL_DEBUG_MSG_INFO_KHR: level = PL_LOG_DEBUG; break; + } + + pl_msg(log, level, "EGL: %s: %s %s", command, egl_err_str(error), + message); + + if (level <= PL_LOG_ERR) + pl_log_stack_trace(log, level); +} + +// Guards access to the (thread-unsafe) glad global EGL state +static pl_static_mutex glad_egl_mutex = PL_STATIC_MUTEX_INITIALIZER; + +void pl_opengl_destroy(pl_opengl *ptr) +{ + pl_opengl pl_gl = *ptr; + if (!pl_gl) + return; + + struct gl_ctx *p = PL_PRIV(pl_gl); + gl_funcs *gl = &p->func; + if (!gl_make_current(pl_gl)) { + PL_WARN(p, "Failed uninitializing OpenGL context, leaking resources!"); + 
return; + } + + if (p->is_debug) + gl->DebugMessageCallback(NULL, NULL); + + if (p->is_debug_egl) + eglDebugMessageControlKHR(NULL, NULL); + + pl_gpu_destroy(pl_gl->gpu); + +#ifdef PL_HAVE_GL_PROC_ADDR + if (p->is_gles) { + gladLoaderUnloadGLES2Context(gl); + } else { + gladLoaderUnloadGLContext(gl); + } + + bool used_loader = !p->params.get_proc_addr && !p->params.get_proc_addr_ex; + if (p->params.egl_display && used_loader) { + pl_static_mutex_lock(&glad_egl_mutex); + gladLoaderUnloadEGL(); + pl_static_mutex_unlock(&glad_egl_mutex); + } +#endif + + gl_release_current(pl_gl); + pl_mutex_destroy(&p->lock); + pl_free_ptr((void **) ptr); + +} + +typedef PL_ARRAY(const char *) ext_arr_t; +static void add_exts_str(void *alloc, ext_arr_t *arr, const char *extstr) +{ + pl_str rest = pl_str_strip(pl_str0(pl_strdup0(alloc, pl_str0(extstr)))); + while (rest.len) { + pl_str ext = pl_str_split_char(rest, ' ', &rest); + ext.buf[ext.len] = '\0'; // re-use separator for terminator + PL_ARRAY_APPEND(alloc, *arr, (char *) ext.buf); + } +} + +pl_opengl pl_opengl_create(pl_log log, const struct pl_opengl_params *params) +{ + params = PL_DEF(params, &pl_opengl_default_params); + struct pl_opengl_t *pl_gl = pl_zalloc_obj(NULL, pl_gl, struct gl_ctx); + struct gl_ctx *p = PL_PRIV(pl_gl); + gl_funcs *gl = &p->func; + p->params = *params; + p->log = log; + + pl_mutex_init_type(&p->lock, PL_MUTEX_RECURSIVE); + if (!gl_make_current(pl_gl)) { + pl_free(pl_gl); + return NULL; + } + + bool ok; + if (params->get_proc_addr_ex) { + ok = gladLoadGLContextUserPtr(gl, params->get_proc_addr_ex, params->proc_ctx); + } else if (params->get_proc_addr) { + ok = gladLoadGLContext(gl, params->get_proc_addr); + } else { +#ifdef PL_HAVE_GL_PROC_ADDR + ok = gladLoaderLoadGLContext(gl); +#else + PL_FATAL(p, "No `glGetProcAddress` function provided, and libplacebo " + "built without its built-in OpenGL loader!"); + goto error; +#endif + } + + if (!ok) { + PL_INFO(p, "Failed loading core GL, retrying as GLES..."); + } else if (gl_is_gles(pl_gl)) { + PL_INFO(p, "GL context seems to be OpenGL ES, reloading as GLES..."); + ok = false; + } + + if (!ok) { + memset(gl, 0, sizeof(*gl)); + if (params->get_proc_addr_ex) { + ok = gladLoadGLES2ContextUserPtr(gl, params->get_proc_addr_ex, params->proc_ctx); + } else if (params->get_proc_addr) { + ok = gladLoadGLES2Context(gl, params->get_proc_addr); + } else { +#ifdef PL_HAVE_GL_PROC_ADDR + ok = gladLoaderLoadGLES2Context(gl); +#else + pl_unreachable(); +#endif + } + p->is_gles = ok; + } + + if (!ok) { + PL_FATAL(p, "Failed to initialize OpenGL context - make sure a valid " + "OpenGL context is bound to the current thread!"); + goto error; + } + + const char *version = (const char *) gl->GetString(GL_VERSION); + if (version) { + const char *ver = version; + while (!isdigit(*ver) && *ver != '\0') + ver++; + if (sscanf(ver, "%d.%d", &pl_gl->major, &pl_gl->minor) != 2) { + PL_FATAL(p, "Invalid GL_VERSION string: %s\n", version); + goto error; + } + } + + if (!pl_gl->major) { + PL_FATAL(p, "No OpenGL version detected - make sure an OpenGL context " + "is bound to the current thread!"); + goto error; + } + + static const int gl_ver_req = 3; + if (pl_gl->major < gl_ver_req) { + PL_FATAL(p, "OpenGL version too old (%d < %d), please use a newer " + "OpenGL implementation or downgrade libplacebo!", + pl_gl->major, gl_ver_req); + goto error; + } + + PL_INFO(p, "Detected OpenGL version strings:"); + PL_INFO(p, " GL_VERSION: %s", version); + PL_INFO(p, " GL_VENDOR: %s", (char *) gl->GetString(GL_VENDOR)); + 
PL_INFO(p, " GL_RENDERER: %s", (char *) gl->GetString(GL_RENDERER)); + + ext_arr_t exts = {0}; + if (pl_gl->major >= 3) { + gl->GetIntegerv(GL_NUM_EXTENSIONS, &exts.num); + PL_ARRAY_RESIZE(pl_gl, exts, exts.num); + for (int i = 0; i < exts.num; i++) + exts.elem[i] = (const char *) gl->GetStringi(GL_EXTENSIONS, i); + } else { + add_exts_str(pl_gl, &exts, (const char *) gl->GetString(GL_EXTENSIONS)); + } + + if (pl_msg_test(log, PL_LOG_DEBUG)) { + PL_DEBUG(p, " GL_EXTENSIONS:"); + for (int i = 0; i < exts.num; i++) + PL_DEBUG(p, " %s", exts.elem[i]); + } + + if (params->egl_display) { + pl_static_mutex_lock(&glad_egl_mutex); + if (params->get_proc_addr_ex) { + ok = gladLoadEGLUserPtr(params->egl_display, params->get_proc_addr_ex, + params->proc_ctx); + } else if (params->get_proc_addr) { + ok = gladLoadEGL(params->egl_display, params->get_proc_addr); + } else { +#ifdef PL_HAVE_GL_PROC_ADDR + ok = gladLoaderLoadEGL(params->egl_display); +#else + pl_unreachable(); +#endif + } + pl_static_mutex_unlock(&glad_egl_mutex); + + if (!ok) { + PL_FATAL(p, "Failed loading EGL functions - double check EGLDisplay?"); + goto error; + } + + int start = exts.num; + add_exts_str(pl_gl, &exts, eglQueryString(params->egl_display, + EGL_EXTENSIONS)); + if (exts.num > start) { + PL_DEBUG(p, " EGL_EXTENSIONS:"); + for (int i = start; i < exts.num; i++) + PL_DEBUG(p, " %s", exts.elem[i]); + } + } + + pl_gl->extensions = exts.elem; + pl_gl->num_extensions = exts.num; + + if (!params->allow_software && gl_is_software(pl_gl)) { + PL_FATAL(p, "OpenGL context is suspected to be a software rasterizer, " + "but `allow_software` is false."); + goto error; + } + + if (params->debug) { + if (pl_opengl_has_ext(pl_gl, "GL_KHR_debug")) { + gl->DebugMessageCallback(debug_cb, log); + gl->Enable(GL_DEBUG_OUTPUT); + p->is_debug = true; + } else { + PL_WARN(p, "OpenGL debugging requested, but GL_KHR_debug is not " + "available... ignoring!"); + } + + if (params->egl_display && pl_opengl_has_ext(pl_gl, "EGL_KHR_debug")) { + static const EGLAttrib attribs[] = { + // Enable everything under the sun, because the `pl_ctx` log + // level may change at runtime. + EGL_DEBUG_MSG_CRITICAL_KHR, EGL_TRUE, + EGL_DEBUG_MSG_ERROR_KHR, EGL_TRUE, + EGL_DEBUG_MSG_WARN_KHR, EGL_TRUE, + EGL_DEBUG_MSG_INFO_KHR, EGL_TRUE, + EGL_NONE, + }; + + eglDebugMessageControlKHR(debug_cb_egl, attribs); + eglLabelObjectKHR(NULL, EGL_OBJECT_THREAD_KHR, NULL, (void *) log); + p->is_debug_egl = true; + } + } + + pl_gl->gpu = pl_gpu_create_gl(log, pl_gl, params); + if (!pl_gl->gpu) + goto error; + + gl_release_current(pl_gl); + return pl_gl; + +error: + PL_FATAL(p, "Failed initializing opengl context!"); + gl_release_current(pl_gl); + pl_opengl_destroy((pl_opengl *) &pl_gl); + return NULL; +} + +bool gl_make_current(pl_opengl pl_gl) +{ + struct gl_ctx *p = PL_PRIV(pl_gl); + pl_mutex_lock(&p->lock); + if (!p->count && p->params.make_current) { + if (!p->params.make_current(p->params.priv)) { + PL_ERR(p, "Failed making OpenGL context current on calling thread!"); + pl_mutex_unlock(&p->lock); + return false; + } + } + + p->count++; + return true; +} + +void gl_release_current(pl_opengl pl_gl) +{ + struct gl_ctx *p = PL_PRIV(pl_gl); + p->count--; + if (!p->count && p->params.release_current) + p->params.release_current(p->params.priv); + pl_mutex_unlock(&p->lock); +} diff --git a/src/opengl/formats.c b/src/opengl/formats.c new file mode 100644 index 0000000..6604835 --- /dev/null +++ b/src/opengl/formats.c @@ -0,0 +1,485 @@ +/* + * This file is part of libplacebo. 
+ * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "gpu.h" +#include "common.h" +#include "formats.h" +#include "utils.h" + +#ifdef PL_HAVE_UNIX +static bool supported_fourcc(struct pl_gl *p, EGLint fourcc) +{ + for (int i = 0; i < p->egl_formats.num; ++i) + if (fourcc == p->egl_formats.elem[i]) + return true; + return false; +} +#endif + +#define FMT(_name, bits, ftype, _caps) \ + (struct pl_fmt_t) { \ + .name = _name, \ + .type = PL_FMT_##ftype, \ + .caps = (enum pl_fmt_caps) (_caps), \ + .sample_order = {0, 1, 2, 3}, \ + .component_depth = {bits, bits, bits, bits}, \ + } + +// Convenience to make the names simpler +enum { + // Type aliases + U8 = GL_UNSIGNED_BYTE, + U16 = GL_UNSIGNED_SHORT, + U32 = GL_UNSIGNED_INT, + I8 = GL_BYTE, + I16 = GL_SHORT, + I32 = GL_INT, + FLT = GL_FLOAT, + + // Component aliases + R = GL_RED, + RG = GL_RG, + RGB = GL_RGB, + RGBA = GL_RGBA, + BGRA = GL_BGRA, + RI = GL_RED_INTEGER, + RGI = GL_RG_INTEGER, + RGBI = GL_RGB_INTEGER, + RGBAI = GL_RGBA_INTEGER, + + // Capability aliases + S = PL_FMT_CAP_SAMPLEABLE, + L = PL_FMT_CAP_LINEAR, + F = PL_FMT_CAP_RENDERABLE | PL_FMT_CAP_BLITTABLE, // FBO support + V = PL_FMT_CAP_VERTEX, +}; + +// Basic 8-bit formats +const struct gl_format formats_norm8[] = { + {GL_R8, R, U8, FMT("r8", 8, UNORM, S|L|F|V)}, + {GL_RG8, RG, U8, FMT("rg8", 8, UNORM, S|L|F|V)}, + {GL_RGB8, RGB, U8, FMT("rgb8", 8, UNORM, S|L|F|V)}, + {GL_RGBA8, RGBA, U8, FMT("rgba8", 8, UNORM, S|L|F|V)}, +}; + +// Signed variants +/* TODO: these are broken in mesa +const struct gl_format formats_snorm8[] = { + {GL_R8_SNORM, R, I8, FMT("r8s", 8, SNORM, S|L|F|V)}, + {GL_RG8_SNORM, RG, I8, FMT("rg8s", 8, SNORM, S|L|F|V)}, + {GL_RGB8_SNORM, RGB, I8, FMT("rgb8s", 8, SNORM, S|L|F|V)}, + {GL_RGBA8_SNORM, RGBA, I8, FMT("rgba8s", 8, SNORM, S|L|F|V)}, +}; +*/ + +// BGRA 8-bit +const struct gl_format formats_bgra8[] = { + {GL_RGBA8, BGRA, U8, { + .name = "bgra8", + .type = PL_FMT_UNORM, + .caps = S|L|F|V, + .sample_order = {2, 1, 0, 3}, + .component_depth = {8, 8, 8, 8}, + }}, +}; + +// Basic 16-bit formats, excluding rgb16 (special cased below) +const struct gl_format formats_norm16[] = { + {GL_R16, R, U16, FMT("r16", 16, UNORM, S|L|F|V)}, + {GL_RG16, RG, U16, FMT("rg16", 16, UNORM, S|L|F|V)}, + {GL_RGBA16, RGBA, U16, FMT("rgba16", 16, UNORM, S|L|F|V)}, +}; + +// Renderable version of rgb16 +const struct gl_format formats_rgb16_fbo[] = { + {GL_RGB16, RGB, U16, FMT("rgb16", 16, UNORM, S|L|F|V)}, +}; + +// Non-renderable version of rgb16 +const struct gl_format formats_rgb16_fallback[] = { + {GL_RGB16, RGB, U16, FMT("rgb16", 16, UNORM, S|L|V)}, +}; + +// Signed 16-bit variants +/* TODO: these are broken in mesa and nvidia +const struct gl_format formats_snorm16[] = { + {GL_R16_SNORM, R, I16, FMT("r16s", 16, SNORM, S|L|F|V)}, + {GL_RG16_SNORM, RG, I16, FMT("rg16s", 16, SNORM, S|L|F|V)}, + {GL_RGB16_SNORM, RGB, I16, FMT("rgb16s", 16, SNORM, 
S|L|F|V)}, + {GL_RGBA16_SNORM, RGBA, I16, FMT("rgba16s", 16, SNORM, S|L|F|V)}, +}; +*/ + +// Floating point texture formats +const struct gl_format formats_float[] = { + {GL_R16F, R, FLT, FMT("r16f", 16, FLOAT, S|L|F)}, + {GL_RG16F, RG, FLT, FMT("rg16f", 16, FLOAT, S|L|F)}, + {GL_RGB16F, RGB, FLT, FMT("rgb16f", 16, FLOAT, S|L|F)}, + {GL_RGBA16F, RGBA, FLT, FMT("rgba16f", 16, FLOAT, S|L|F)}, + {GL_R32F, R, FLT, FMT("r32f", 32, FLOAT, S|L|F|V)}, + {GL_RG32F, RG, FLT, FMT("rg32f", 32, FLOAT, S|L|F|V)}, + {GL_RGB32F, RGB, FLT, FMT("rgb32f", 32, FLOAT, S|L|F|V)}, + {GL_RGBA32F, RGBA, FLT, FMT("rgba32f", 32, FLOAT, S|L|F|V)}, +}; + +// Renderable 16-bit float formats (excluding rgb16f) +const struct gl_format formats_float16_fbo[] = { + {GL_R16F, R, FLT, FMT("r16f", 16, FLOAT, S|L|F)}, + {GL_RG16F, RG, FLT, FMT("rg16f", 16, FLOAT, S|L|F)}, + {GL_RGB16F, RGB, FLT, FMT("rgb16f", 16, FLOAT, S|L)}, + {GL_RGBA16F, RGBA, FLT, FMT("rgba16f", 16, FLOAT, S|L|F)}, +}; + +// Non-renderable 16-bit float formats +const struct gl_format formats_float16_fallback[] = { + {GL_R16F, R, FLT, FMT("r16f", 16, FLOAT, S|L)}, + {GL_RG16F, RG, FLT, FMT("rg16f", 16, FLOAT, S|L)}, + {GL_RGB16F, RGB, FLT, FMT("rgb16f", 16, FLOAT, S|L)}, + {GL_RGBA16F, RGBA, FLT, FMT("rgba16f", 16, FLOAT, S|L)}, +}; + +// (Unsigned) integer formats +const struct gl_format formats_uint[] = { + {GL_R8UI, RI, U8, FMT("r8u", 8, UINT, S|F|V)}, + {GL_RG8UI, RGI, U8, FMT("rg8u", 8, UINT, S|F|V)}, + {GL_RGB8UI, RGBI, U8, FMT("rgb8u", 8, UINT, S|V)}, + {GL_RGBA8UI, RGBAI, U8, FMT("rgba8u", 8, UINT, S|F|V)}, + {GL_R16UI, RI, U16, FMT("r16u", 16, UINT, S|F|V)}, + {GL_RG16UI, RGI, U16, FMT("rg16u", 16, UINT, S|F|V)}, + {GL_RGB16UI, RGBI, U16, FMT("rgb16u", 16, UINT, S|V)}, + {GL_RGBA16UI, RGBAI, U16, FMT("rgba16u", 16, UINT, S|F|V)}, +}; + +/* TODO + {GL_R32UI, RI, U32, FMT("r32u", 32, UINT)}, + {GL_RG32UI, RGI, U32, FMT("rg32u", 32, UINT)}, + {GL_RGB32UI, RGBI, U32, FMT("rgb32u", 32, UINT)}, + {GL_RGBA32UI, RGBAI, U32, FMT("rgba32u", 32, UINT)}, + + {GL_R8I, RI, I8, FMT("r8i", 8, SINT)}, + {GL_RG8I, RGI, I8, FMT("rg8i", 8, SINT)}, + {GL_RGB8I, RGBI, I8, FMT("rgb8i", 8, SINT)}, + {GL_RGBA8I, RGBAI, I8, FMT("rgba8i", 8, SINT)}, + {GL_R16I, RI, I16, FMT("r16i", 16, SINT)}, + {GL_RG16I, RGI, I16, FMT("rg16i", 16, SINT)}, + {GL_RGB16I, RGBI, I16, FMT("rgb16i", 16, SINT)}, + {GL_RGBA16I, RGBAI, I16, FMT("rgba16i", 16, SINT)}, + {GL_R32I, RI, I32, FMT("r32i", 32, SINT)}, + {GL_RG32I, RGI, I32, FMT("rg32i", 32, SINT)}, + {GL_RGB32I, RGBI, I32, FMT("rgb32i", 32, SINT)}, + {GL_RGBA32I, RGBAI, I32, FMT("rgba32i", 32, SINT)}, +*/ + +// GL2 legacy formats +const struct gl_format formats_legacy_gl2[] = { + {GL_RGB8, RGB, U8, FMT("rgb8", 8, UNORM, S|L|V)}, + {GL_RGBA8, RGBA, U8, FMT("rgba8", 8, UNORM, S|L|V)}, + {GL_RGB16, RGB, U16, FMT("rgb16", 16, UNORM, S|L|V)}, + {GL_RGBA16, RGBA, U16, FMT("rgba16", 16, UNORM, S|L|V)}, +}; + +// GLES2 legacy formats +const struct gl_format formats_legacy_gles2[] = { + {GL_RGB, RGB, U8, FMT("rgb", 8, UNORM, S|L)}, + {GL_RGBA, RGBA, U8, FMT("rgba", 8, UNORM, S|L)}, +}; + +// GLES BGRA +const struct gl_format formats_bgra_gles[] = { + {GL_BGRA, BGRA, U8, { + .name = "bgra8", + .type = PL_FMT_UNORM, + .caps = S|L|F|V, + .sample_order = {2, 1, 0, 3}, + .component_depth = {8, 8, 8, 8}, + }}, +}; + +// Fallback for vertex-only formats, as a last resort +const struct gl_format formats_basic_vertex[] = { + {GL_R32F, R, FLT, FMT("r32f", 32, FLOAT, V)}, + {GL_RG32F, RG, FLT, FMT("rg32f", 32, FLOAT, V)}, + {GL_RGB32F, RGB, FLT, 
FMT("rgb32f", 32, FLOAT, V)}, + {GL_RGBA32F, RGBA, FLT, FMT("rgba32f", 32, FLOAT, V)}, +}; + +static void add_format(pl_gpu pgpu, const struct gl_format *gl_fmt) +{ + struct pl_gpu_t *gpu = (struct pl_gpu_t *) pgpu; + struct pl_gl *p = PL_PRIV(gpu); + + struct pl_fmt_t *fmt = pl_alloc_obj(gpu, fmt, gl_fmt); + const struct gl_format **fmtp = PL_PRIV(fmt); + *fmt = gl_fmt->tmpl; + *fmtp = gl_fmt; + + // Calculate the host size and number of components + switch (gl_fmt->fmt) { + case GL_RED: + case GL_RED_INTEGER: + fmt->num_components = 1; + break; + case GL_RG: + case GL_RG_INTEGER: + fmt->num_components = 2; + break; + case GL_RGB: + case GL_RGB_INTEGER: + fmt->num_components = 3; + break; + case GL_RGBA: + case GL_RGBA_INTEGER: + case GL_BGRA: + fmt->num_components = 4; + break; + default: + pl_unreachable(); + } + + int size; + switch (gl_fmt->type) { + case GL_BYTE: + case GL_UNSIGNED_BYTE: + size = 1; + break; + case GL_SHORT: + case GL_UNSIGNED_SHORT: + size = 2; + break; + case GL_INT: + case GL_UNSIGNED_INT: + case GL_FLOAT: + size = 4; + break; + default: + pl_unreachable(); + } + + // Host visible representation + fmt->texel_size = fmt->num_components * size; + fmt->texel_align = 1; + for (int i = 0; i < fmt->num_components; i++) + fmt->host_bits[i] = size * 8; + + // Compute internal size by summing up the depth + int ibits = 0; + for (int i = 0; i < fmt->num_components; i++) + ibits += fmt->component_depth[i]; + fmt->internal_size = (ibits + 7) / 8; + + // We're not the ones actually emulating these texture format - the + // driver is - but we might as well set the hint. + fmt->emulated = fmt->texel_size != fmt->internal_size; + + // 3-component formats are almost surely also emulated + if (fmt->num_components == 3) + fmt->emulated = true; + + // Older OpenGL most likely emulates 32-bit float formats as well + if (p->gl_ver < 30 && fmt->component_depth[0] >= 32) + fmt->emulated = true; + + // For sanity, clear the superfluous fields + for (int i = fmt->num_components; i < 4; i++) { + fmt->component_depth[i] = 0; + fmt->sample_order[i] = 0; + fmt->host_bits[i] = 0; + } + + fmt->glsl_type = pl_var_glsl_type_name(pl_var_from_fmt(fmt, "")); + fmt->glsl_format = pl_fmt_glsl_format(fmt, fmt->num_components); + fmt->fourcc = pl_fmt_fourcc(fmt); + pl_assert(fmt->glsl_type); + +#ifdef PL_HAVE_UNIX + if (p->has_modifiers && fmt->fourcc && supported_fourcc(p, fmt->fourcc)) { + int num_mods = 0; + bool ok = eglQueryDmaBufModifiersEXT(p->egl_dpy, fmt->fourcc, + 0, NULL, NULL, &num_mods); + if (ok && num_mods) { + // On my system eglQueryDmaBufModifiersEXT seems to never return + // MOD_INVALID even though eglExportDMABUFImageQueryMESA happily + // returns such modifiers. Since we handle INVALID by not + // requiring modifiers at all, always add this value to the + // list of supported modifiers. May result in duplicates, but + // whatever. 
+ uint64_t *mods = pl_calloc(fmt, num_mods + 1, sizeof(uint64_t)); + mods[0] = DRM_FORMAT_MOD_INVALID; + ok = eglQueryDmaBufModifiersEXT(p->egl_dpy, fmt->fourcc, num_mods, + &mods[1], NULL, &num_mods); + + if (ok) { + fmt->modifiers = mods; + fmt->num_modifiers = num_mods + 1; + } else { + pl_free(mods); + } + } + + eglGetError(); // ignore probing errors + } + + if (!fmt->num_modifiers) { + // Hacky fallback for older drivers that don't support properly + // querying modifiers + static const uint64_t static_mods[] = { + DRM_FORMAT_MOD_INVALID, + DRM_FORMAT_MOD_LINEAR, + }; + + fmt->num_modifiers = PL_ARRAY_SIZE(static_mods); + fmt->modifiers = static_mods; + } +#endif + + // Gathering requires checking the format type (and extension presence) + if (fmt->caps & PL_FMT_CAP_SAMPLEABLE) + fmt->gatherable = p->gather_comps >= fmt->num_components; + + // Reading from textures on GLES requires FBO support for this fmt + if (p->has_readback && (p->gl_ver || (fmt->caps & PL_FMT_CAP_RENDERABLE))) + fmt->caps |= PL_FMT_CAP_HOST_READABLE; + + if (gpu->glsl.compute && fmt->glsl_format && p->has_storage) + fmt->caps |= PL_FMT_CAP_STORABLE | PL_FMT_CAP_READWRITE; + + // Only float-type formats are considered blendable in OpenGL + switch (fmt->type) { + case PL_FMT_UNKNOWN: + case PL_FMT_UINT: + case PL_FMT_SINT: + break; + case PL_FMT_FLOAT: + case PL_FMT_UNORM: + case PL_FMT_SNORM: + if (fmt->caps & PL_FMT_CAP_RENDERABLE) + fmt->caps |= PL_FMT_CAP_BLENDABLE; + break; + case PL_FMT_TYPE_COUNT: + pl_unreachable(); + } + + // TODO: Texel buffers + + PL_ARRAY_APPEND_RAW(gpu, gpu->formats, gpu->num_formats, fmt); +} + +#define DO_FORMATS(formats) \ + do { \ + for (int i = 0; i < PL_ARRAY_SIZE(formats); i++) \ + add_format(gpu, &formats[i]); \ + } while (0) + +bool gl_setup_formats(struct pl_gpu_t *gpu) +{ + struct pl_gl *p = PL_PRIV(gpu); + +#ifdef PL_HAVE_UNIX + if (p->has_modifiers) { + EGLint num_formats = 0; + bool ok = eglQueryDmaBufFormatsEXT(p->egl_dpy, 0, NULL, + &num_formats); + if (ok && num_formats) { + p->egl_formats.elem = pl_calloc(gpu, num_formats, sizeof(EGLint)); + p->egl_formats.num = num_formats; + ok = eglQueryDmaBufFormatsEXT(p->egl_dpy, num_formats, + p->egl_formats.elem, &num_formats); + pl_assert(ok); + + PL_DEBUG(gpu, "EGL formats supported:"); + for (int i = 0; i < num_formats; ++i) { + PL_DEBUG(gpu, " 0x%08x(%.4s)", p->egl_formats.elem[i], + PRINT_FOURCC(p->egl_formats.elem[i])); + } + } + } +#endif + + if (p->gl_ver >= 30) { + // Desktop GL3+ has everything + DO_FORMATS(formats_norm8); + DO_FORMATS(formats_bgra8); + DO_FORMATS(formats_norm16); + DO_FORMATS(formats_rgb16_fbo); + DO_FORMATS(formats_float); + DO_FORMATS(formats_uint); + goto done; + } + + if (p->gl_ver >= 21) { + // If we have a reasonable set of extensions, we can enable most + // things. 
Otherwise, pick simple fallback formats + if (pl_opengl_has_ext(p->gl, "GL_ARB_texture_float") && + pl_opengl_has_ext(p->gl, "GL_ARB_texture_rg") && + pl_opengl_has_ext(p->gl, "GL_ARB_framebuffer_object")) + { + DO_FORMATS(formats_norm8); + DO_FORMATS(formats_bgra8); + DO_FORMATS(formats_norm16); + DO_FORMATS(formats_rgb16_fbo); + DO_FORMATS(formats_float); + } else { + // Fallback for GL2 + DO_FORMATS(formats_legacy_gl2); + DO_FORMATS(formats_basic_vertex); + } + goto done; + } + + if (p->gles_ver >= 30) { + // GLES 3.0 has some basic formats, with framebuffers for float16 + // depending on GL_EXT_color_buffer_(half_)float support + DO_FORMATS(formats_norm8); + if (pl_opengl_has_ext(p->gl, "GL_EXT_texture_norm16")) { + DO_FORMATS(formats_norm16); + DO_FORMATS(formats_rgb16_fallback); + } + if (pl_opengl_has_ext(p->gl, "GL_EXT_texture_format_BGRA8888")) + DO_FORMATS(formats_bgra_gles); + if (pl_opengl_has_ext(p->gl, "GL_EXT_texture_integer")) + DO_FORMATS(formats_uint); + DO_FORMATS(formats_basic_vertex); + if (p->gles_ver >= 32 || pl_opengl_has_ext(p->gl, "GL_EXT_color_buffer_half_float") + || pl_opengl_has_ext(p->gl, "GL_EXT_color_buffer_float")) { + DO_FORMATS(formats_float16_fbo); + } else { + DO_FORMATS(formats_float16_fallback); + } + goto done; + } + + if (p->gles_ver >= 20) { + // GLES 2.0 only has some legacy fallback formats, with support for + // float16 depending on GL_EXT_texture_norm16 being present + DO_FORMATS(formats_legacy_gles2); + DO_FORMATS(formats_basic_vertex); + if (pl_opengl_has_ext(p->gl, "GL_EXT_texture_rg")) { + DO_FORMATS(formats_norm8); + } + if (pl_opengl_has_ext(p->gl, "GL_EXT_texture_format_BGRA8888")) { + DO_FORMATS(formats_bgra_gles); + } + goto done; + } + + // Last resort fallback. Probably not very useful + DO_FORMATS(formats_basic_vertex); + goto done; + +done: + return gl_check_err(gpu, "gl_setup_formats"); +} diff --git a/src/opengl/formats.h b/src/opengl/formats.h new file mode 100644 index 0000000..b98c872 --- /dev/null +++ b/src/opengl/formats.h @@ -0,0 +1,32 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include "common.h" + +struct gl_format { + GLint ifmt; // sized internal format (e.g. GL_RGBA16F) + GLenum fmt; // base internal format (e.g. GL_RGBA) + GLenum type; // host-visible type (e.g. GL_FLOAT) + struct pl_fmt_t tmpl; // pl_fmt template +}; + +typedef void (gl_format_cb)(pl_gpu gpu, const struct gl_format *glfmt); + +// Add all supported formats to the `pl_gpu` format list. +bool gl_setup_formats(struct pl_gpu_t *gpu); diff --git a/src/opengl/gpu.c b/src/opengl/gpu.c new file mode 100644 index 0000000..b711ac5 --- /dev/null +++ b/src/opengl/gpu.c @@ -0,0 +1,645 @@ +/* + * This file is part of libplacebo. 
+ * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "gpu.h" +#include "common.h" +#include "formats.h" +#include "utils.h" + +#ifdef PL_HAVE_UNIX +#include <unistd.h> +#endif + +#ifdef PL_HAVE_WIN32 +#include <windows.h> +#include <sysinfoapi.h> +#endif + +static const struct pl_gpu_fns pl_fns_gl; + +static void gl_gpu_destroy(pl_gpu gpu) +{ + struct pl_gl *p = PL_PRIV(gpu); + + pl_gpu_finish(gpu); + while (p->callbacks.num > 0) + gl_poll_callbacks(gpu); + + pl_free((void *) gpu); +} + +pl_opengl pl_opengl_get(pl_gpu gpu) +{ + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + if (impl->destroy == gl_gpu_destroy) { + struct pl_gl *p = (struct pl_gl *) impl; + return p->gl; + } + + return NULL; +} + +static pl_handle_caps tex_handle_caps(pl_gpu gpu, bool import) +{ + pl_handle_caps caps = 0; + struct pl_gl *p = PL_PRIV(gpu); + + if (!p->egl_dpy || (!p->has_egl_storage && !p->has_egl_import)) + return 0; + + if (import) { + if (pl_opengl_has_ext(p->gl, "EGL_EXT_image_dma_buf_import")) + caps |= PL_HANDLE_DMA_BUF; + } else if (!import && p->egl_ctx) { + if (pl_opengl_has_ext(p->gl, "EGL_MESA_image_dma_buf_export")) + caps |= PL_HANDLE_DMA_BUF; + } + + return caps; +} + +static inline size_t get_page_size(void) +{ + +#ifdef PL_HAVE_UNIX + return sysconf(_SC_PAGESIZE); +#endif + +#ifdef PL_HAVE_WIN32 + SYSTEM_INFO sysInfo; + GetSystemInfo(&sysInfo); + return sysInfo.dwAllocationGranularity; +#endif + + pl_assert(!"Unsupported platform!"); +} + +#define get(pname, field) \ + do { \ + GLint tmp = 0; \ + gl->GetIntegerv((pname), &tmp); \ + *(field) = tmp; \ + } while (0) + +#define geti(pname, i, field) \ + do { \ + GLint tmp = 0; \ + gl->GetIntegeri_v((pname), i, &tmp);\ + *(field) = tmp; \ + } while (0) + +pl_gpu pl_gpu_create_gl(pl_log log, pl_opengl pl_gl, const struct pl_opengl_params *params) +{ + struct pl_gpu_t *gpu = pl_zalloc_obj(NULL, gpu, struct pl_gl); + gpu->log = log; + + struct pl_gl *p = PL_PRIV(gpu); + p->impl = pl_fns_gl; + p->gl = pl_gl; + + const gl_funcs *gl = gl_funcs_get(gpu); + struct pl_glsl_version *glsl = &gpu->glsl; + glsl->gles = gl_is_gles(pl_gl); + int ver = pl_gl->major * 10 + pl_gl->minor; + p->gl_ver = glsl->gles ? 0 : ver; + p->gles_ver = glsl->gles ? ver : 0; + + // If possible, query the GLSL version from the implementation + const char *glslver = (char *) gl->GetString(GL_SHADING_LANGUAGE_VERSION); + if (glslver) { + PL_INFO(gpu, " GL_SHADING_LANGUAGE_VERSION: %s", glslver); + int major = 0, minor = 0; + if (sscanf(glslver, "%d.%d", &major, &minor) == 2) + glsl->version = major * 100 + minor; + } + + if (!glsl->version) { + // Otherwise, use the fixed magic versions 100 and 300 for GLES. 
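+ // (100 = GLSL ES 1.00 on GLES 2.0, 300 = GLSL ES 3.00 on GLES 3.x; a desktop
+ // GL context without a parseable version string is treated as an error below)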
+ if (p->gles_ver >= 30) { + glsl->version = 300; + } else if (p->gles_ver >= 20) { + glsl->version = 100; + } else { + goto error; + } + } + + static const int glsl_ver_req = 130; + if (glsl->version < glsl_ver_req) { + PL_FATAL(gpu, "GLSL version too old (%d < %d), please use a newer " + "OpenGL implementation or downgrade libplacebo!", + glsl->version, glsl_ver_req); + goto error; + } + + if (params->max_glsl_version && params->max_glsl_version >= glsl_ver_req) { + glsl->version = PL_MIN(glsl->version, params->max_glsl_version); + PL_INFO(gpu, "Restricting GLSL version to %d... new version is %d", + params->max_glsl_version, glsl->version); + } + + if (gl_test_ext(gpu, "GL_ARB_compute_shader", 43, 0) && glsl->version >= 420) { + glsl->compute = true; + get(GL_MAX_COMPUTE_SHARED_MEMORY_SIZE, &glsl->max_shmem_size); + get(GL_MAX_COMPUTE_WORK_GROUP_INVOCATIONS, &glsl->max_group_threads); + for (int i = 0; i < 3; i++) + geti(GL_MAX_COMPUTE_WORK_GROUP_SIZE, i, &glsl->max_group_size[i]); + } + + if (gl_test_ext(gpu, "GL_ARB_texture_gather", 40, 0)) { + get(GL_MAX_PROGRAM_TEXTURE_GATHER_COMPONENTS_ARB, &p->gather_comps); + get(GL_MIN_PROGRAM_TEXTURE_GATHER_OFFSET_ARB, &glsl->min_gather_offset); + get(GL_MAX_PROGRAM_TEXTURE_GATHER_OFFSET_ARB, &glsl->max_gather_offset); + } + + // Query all device limits + struct pl_gpu_limits *limits = &gpu->limits; + limits->thread_safe = params->make_current; + limits->callbacks = gl_test_ext(gpu, "GL_ARB_sync", 32, 30); + limits->align_vertex_stride = 1; + if (gl_test_ext(gpu, "GL_ARB_pixel_buffer_object", 31, 0)) { + limits->max_buf_size = SIZE_MAX; // no restriction imposed by GL + if (gl_test_ext(gpu, "GL_ARB_uniform_buffer_object", 31, 0)) + get(GL_MAX_UNIFORM_BLOCK_SIZE, &limits->max_ubo_size); + if (gl_test_ext(gpu, "GL_ARB_shader_storage_buffer_object", 43, 0) && + gpu->glsl.version >= 140) + { + get(GL_MAX_SHADER_STORAGE_BLOCK_SIZE, &limits->max_ssbo_size); + } + limits->max_vbo_size = limits->max_buf_size; // No additional restrictions + if (gl_test_ext(gpu, "GL_ARB_buffer_storage", 44, 0)) { + const char *vendor = (char *) gl->GetString(GL_VENDOR); + limits->max_mapped_size = limits->max_buf_size; + limits->host_cached = strcmp(vendor, "AMD") == 0 || + strcmp(vendor, "NVIDIA Corporation") == 0; + } + } + + get(GL_MAX_TEXTURE_SIZE, &limits->max_tex_2d_dim); + if (gl_test_ext(gpu, "GL_EXT_texture3D", 21, 30)) + get(GL_MAX_3D_TEXTURE_SIZE, &limits->max_tex_3d_dim); + // There's no equivalent limit for 1D textures for whatever reason, so + // just set it to the same as the 2D limit + if (p->gl_ver >= 21) + limits->max_tex_1d_dim = limits->max_tex_2d_dim; + limits->buf_transfer = true; + + if (p->gl_ver || p->gles_ver >= 30) { + get(GL_MAX_FRAGMENT_UNIFORM_COMPONENTS, &limits->max_variable_comps); + } else { + // fallback for GLES 2.0, which doesn't have max_comps + get(GL_MAX_FRAGMENT_UNIFORM_VECTORS, &limits->max_variable_comps); + limits->max_variable_comps *= 4; + } + + if (glsl->compute) { + for (int i = 0; i < 3; i++) + geti(GL_MAX_COMPUTE_WORK_GROUP_COUNT, i, &limits->max_dispatch[i]); + } + + // Query import/export support + p->egl_dpy = params->egl_display; + p->egl_ctx = params->egl_context; + p->has_egl_storage = pl_opengl_has_ext(p->gl, "GL_EXT_EGL_image_storage"); + p->has_egl_import = pl_opengl_has_ext(p->gl, "GL_OES_EGL_image_external"); + gpu->export_caps.tex = tex_handle_caps(gpu, false); + gpu->import_caps.tex = tex_handle_caps(gpu, true); + + if (p->egl_dpy) { + p->has_modifiers = pl_opengl_has_ext(p->gl, + 
"EGL_EXT_image_dma_buf_import_modifiers"); + } + + if (pl_opengl_has_ext(pl_gl, "GL_AMD_pinned_memory")) { + gpu->import_caps.buf |= PL_HANDLE_HOST_PTR; + gpu->limits.align_host_ptr = get_page_size(); + } + + // Cache some internal capability checks + p->has_vao = gl_test_ext(gpu, "GL_ARB_vertex_array_object", 30, 0); + p->has_invalidate_fb = gl_test_ext(gpu, "GL_ARB_invalidate_subdata", 43, 30); + p->has_invalidate_tex = gl_test_ext(gpu, "GL_ARB_invalidate_subdata", 43, 0); + p->has_queries = gl_test_ext(gpu, "GL_ARB_timer_query", 33, 0); + p->has_storage = gl_test_ext(gpu, "GL_ARB_shader_image_load_store", 42, 0); + p->has_readback = true; + + if (p->has_readback && p->gles_ver) { + GLuint fbo = 0, tex = 0; + GLint read_type = 0, read_fmt = 0; + gl->GenTextures(1, &tex); + gl->BindTexture(GL_TEXTURE_2D, tex); + gl->GenFramebuffers(1, &fbo); + gl->TexImage2D(GL_TEXTURE_2D, 0, GL_R8, 64, 64, 0, GL_RED, + GL_UNSIGNED_BYTE, NULL); + gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, fbo); + gl->FramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, + GL_TEXTURE_2D, tex, 0); + gl->GetIntegerv(GL_IMPLEMENTATION_COLOR_READ_TYPE, &read_type); + gl->GetIntegerv(GL_IMPLEMENTATION_COLOR_READ_FORMAT, &read_fmt); + if (read_type != GL_UNSIGNED_BYTE || read_fmt != GL_RED) { + PL_INFO(gpu, "GPU does not seem to support lossless texture " + "readback, restricting readback capabilities! This is a " + "GLES/driver limitation, there is little we can do to " + "work around it."); + p->has_readback = false; + } + gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, 0); + gl->BindTexture(GL_TEXTURE_2D, 0); + gl->DeleteFramebuffers(1, &fbo); + gl->DeleteTextures(1, &tex); + } + + // We simply don't know, so make up some values + limits->align_tex_xfer_offset = 32; + limits->align_tex_xfer_pitch = 4; + limits->fragment_queues = 1; + limits->compute_queues = glsl->compute ? 1 : 0; + + if (!gl_check_err(gpu, "pl_gpu_create_gl")) { + PL_WARN(gpu, "Encountered errors while detecting GPU capabilities... 
" + "ignoring, but expect limitations/issues"); + p->failed = false; + } + + // Filter out error messages during format probing + pl_log_level_cap(gpu->log, PL_LOG_INFO); + bool formats_ok = gl_setup_formats(gpu); + pl_log_level_cap(gpu->log, PL_LOG_NONE); + if (!formats_ok) + goto error; + + return pl_gpu_finalize(gpu); + +error: + gl_gpu_destroy(gpu); + return NULL; +} + +void gl_buf_destroy(pl_gpu gpu, pl_buf buf) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + if (!MAKE_CURRENT()) { + PL_ERR(gpu, "Failed uninitializing buffer, leaking resources!"); + return; + } + + struct pl_buf_gl *buf_gl = PL_PRIV(buf); + if (buf_gl->fence) + gl->DeleteSync(buf_gl->fence); + + if (buf_gl->mapped) { + gl->BindBuffer(GL_COPY_WRITE_BUFFER, buf_gl->buffer); + gl->UnmapBuffer(GL_COPY_WRITE_BUFFER); + gl->BindBuffer(GL_COPY_WRITE_BUFFER, 0); + } + + gl->DeleteBuffers(1, &buf_gl->buffer); + gl_check_err(gpu, "gl_buf_destroy"); + RELEASE_CURRENT(); + pl_free((void *) buf); +} + +pl_buf gl_buf_create(pl_gpu gpu, const struct pl_buf_params *params) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + if (!MAKE_CURRENT()) + return NULL; + + struct pl_buf_t *buf = pl_zalloc_obj(NULL, buf, struct pl_buf_gl); + buf->params = *params; + buf->params.initial_data = NULL; + + struct pl_gl *p = PL_PRIV(gpu); + struct pl_buf_gl *buf_gl = PL_PRIV(buf); + buf_gl->id = ++p->buf_id; + + // Just use this since the generic GL_BUFFER doesn't work + GLenum target = GL_ARRAY_BUFFER; + const void *data = params->initial_data; + size_t total_size = params->size; + bool import = false; + + if (params->import_handle == PL_HANDLE_HOST_PTR) { + const struct pl_shared_mem *shmem = ¶ms->shared_mem; + target = GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD; + + data = shmem->handle.ptr; + buf_gl->offset = shmem->offset; + total_size = shmem->size; + import = true; + + if (params->host_mapped) + buf->data = (uint8_t *) data + buf_gl->offset; + + if (buf_gl->offset > 0 && params->drawable) { + PL_ERR(gpu, "Cannot combine non-aligned host pointer imports with " + "drawable (vertex) buffers! 
This is a design limitation, " + "open an issue if you absolutely need this."); + goto error; + } + } + + gl->GenBuffers(1, &buf_gl->buffer); + gl->BindBuffer(target, buf_gl->buffer); + + if (gl_test_ext(gpu, "GL_ARB_buffer_storage", 44, 0) && !import) { + + GLbitfield mapflags = 0, storflags = 0; + if (params->host_writable) + storflags |= GL_DYNAMIC_STORAGE_BIT; + if (params->host_mapped) { + mapflags |= GL_MAP_READ_BIT | GL_MAP_WRITE_BIT | + GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT; + } + if (params->memory_type == PL_BUF_MEM_HOST) + storflags |= GL_CLIENT_STORAGE_BIT; // hopefully this works + + gl->BufferStorage(target, total_size, data, storflags | mapflags); + + if (params->host_mapped) { + buf_gl->mapped = true; + buf->data = gl->MapBufferRange(target, buf_gl->offset, params->size, + mapflags); + if (!buf->data) { + gl->BindBuffer(target, 0); + if (!gl_check_err(gpu, "gl_buf_create: map")) + PL_ERR(gpu, "Failed mapping buffer: unknown reason"); + goto error; + } + } + + } else { + + // Make a random guess based on arbitrary criteria we can't know + GLenum hint = GL_STREAM_DRAW; + if (params->initial_data && !params->host_writable && !params->host_mapped) + hint = GL_STATIC_DRAW; + if (params->host_readable && !params->host_writable && !params->host_mapped) + hint = GL_STREAM_READ; + if (params->storable) + hint = GL_DYNAMIC_COPY; + + gl->BufferData(target, total_size, data, hint); + + if (import && gl->GetError() == GL_INVALID_OPERATION) { + PL_ERR(gpu, "Failed importing host pointer!"); + goto error; + } + + } + + gl->BindBuffer(target, 0); + if (!gl_check_err(gpu, "gl_buf_create")) + goto error; + + if (params->storable) { + buf_gl->barrier = GL_BUFFER_UPDATE_BARRIER_BIT | // for buf_copy etc. + GL_PIXEL_BUFFER_BARRIER_BIT | // for tex_upload + GL_SHADER_STORAGE_BARRIER_BIT; + + if (params->host_mapped) + buf_gl->barrier |= GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT; + if (params->uniform) + buf_gl->barrier |= GL_UNIFORM_BARRIER_BIT; + if (params->drawable) + buf_gl->barrier |= GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT; + } + + RELEASE_CURRENT(); + return buf; + +error: + gl_buf_destroy(gpu, buf); + RELEASE_CURRENT(); + return NULL; +} + +bool gl_buf_poll(pl_gpu gpu, pl_buf buf, uint64_t timeout) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + + // Non-persistently mapped buffers are always implicitly reusable in OpenGL, + // the implementation will create more buffers under the hood if needed. + if (!buf->data) + return false; + + if (!MAKE_CURRENT()) + return true; // conservative guess + + struct pl_buf_gl *buf_gl = PL_PRIV(buf); + if (buf_gl->fence) { + GLenum res = gl->ClientWaitSync(buf_gl->fence, + timeout ? 
GL_SYNC_FLUSH_COMMANDS_BIT : 0, + timeout); + if (res == GL_ALREADY_SIGNALED || res == GL_CONDITION_SATISFIED) { + gl->DeleteSync(buf_gl->fence); + buf_gl->fence = NULL; + } + } + + gl_poll_callbacks(gpu); + RELEASE_CURRENT(); + return !!buf_gl->fence; +} + +void gl_buf_write(pl_gpu gpu, pl_buf buf, size_t offset, + const void *data, size_t size) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + if (!MAKE_CURRENT()) + return; + + struct pl_buf_gl *buf_gl = PL_PRIV(buf); + gl->BindBuffer(GL_ARRAY_BUFFER, buf_gl->buffer); + gl->BufferSubData(GL_ARRAY_BUFFER, buf_gl->offset + offset, size, data); + gl->BindBuffer(GL_ARRAY_BUFFER, 0); + gl_check_err(gpu, "gl_buf_write"); + RELEASE_CURRENT(); +} + +bool gl_buf_read(pl_gpu gpu, pl_buf buf, size_t offset, + void *dest, size_t size) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + if (!MAKE_CURRENT()) + return false; + + struct pl_buf_gl *buf_gl = PL_PRIV(buf); + gl->BindBuffer(GL_ARRAY_BUFFER, buf_gl->buffer); + gl->GetBufferSubData(GL_ARRAY_BUFFER, buf_gl->offset + offset, size, dest); + gl->BindBuffer(GL_ARRAY_BUFFER, 0); + bool ok = gl_check_err(gpu, "gl_buf_read"); + RELEASE_CURRENT(); + return ok; +} + +void gl_buf_copy(pl_gpu gpu, pl_buf dst, size_t dst_offset, + pl_buf src, size_t src_offset, size_t size) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + if (!MAKE_CURRENT()) + return; + + struct pl_buf_gl *src_gl = PL_PRIV(src); + struct pl_buf_gl *dst_gl = PL_PRIV(dst); + gl->BindBuffer(GL_COPY_READ_BUFFER, src_gl->buffer); + gl->BindBuffer(GL_COPY_WRITE_BUFFER, dst_gl->buffer); + gl->CopyBufferSubData(GL_COPY_READ_BUFFER, GL_COPY_WRITE_BUFFER, + src_gl->offset + src_offset, + dst_gl->offset + dst_offset, size); + gl_check_err(gpu, "gl_buf_copy"); + RELEASE_CURRENT(); +} + +#define QUERY_OBJECT_NUM 8 + +struct pl_timer_t { + GLuint query[QUERY_OBJECT_NUM]; + int index_write; // next index to write to + int index_read; // next index to read from +}; + +static pl_timer gl_timer_create(pl_gpu gpu) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + struct pl_gl *p = PL_PRIV(gpu); + if (!p->has_queries || !MAKE_CURRENT()) + return NULL; + + pl_timer timer = pl_zalloc_ptr(NULL, timer); + gl->GenQueries(QUERY_OBJECT_NUM, timer->query); + RELEASE_CURRENT(); + return timer; +} + +static void gl_timer_destroy(pl_gpu gpu, pl_timer timer) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + if (!MAKE_CURRENT()) { + PL_ERR(gpu, "Failed uninitializing timer, leaking resources!"); + return; + } + + gl->DeleteQueries(QUERY_OBJECT_NUM, timer->query); + gl_check_err(gpu, "gl_timer_destroy"); + RELEASE_CURRENT(); + pl_free(timer); +} + +static uint64_t gl_timer_query(pl_gpu gpu, pl_timer timer) +{ + if (timer->index_read == timer->index_write) + return 0; // no more unprocessed results + + const gl_funcs *gl = gl_funcs_get(gpu); + if (!MAKE_CURRENT()) + return 0; + + uint64_t res = 0; + GLuint query = timer->query[timer->index_read]; + int avail = 0; + gl->GetQueryObjectiv(query, GL_QUERY_RESULT_AVAILABLE, &avail); + if (!avail) + goto done; + gl->GetQueryObjectui64v(query, GL_QUERY_RESULT, &res); + + timer->index_read = (timer->index_read + 1) % QUERY_OBJECT_NUM; + // fall through + +done: + RELEASE_CURRENT(); + return res; +} + +void gl_timer_begin(pl_gpu gpu, pl_timer timer) +{ + if (!timer) + return; + + const gl_funcs *gl = gl_funcs_get(gpu); + gl->BeginQuery(GL_TIME_ELAPSED, timer->query[timer->index_write]); +} + +void gl_timer_end(pl_gpu gpu, pl_timer timer) +{ + if (!timer) + return; + + const gl_funcs *gl = gl_funcs_get(gpu); + gl->EndQuery(GL_TIME_ELAPSED); + + 
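+ // Advance the write index through the fixed-size ring of query objects;
+ // results are consumed in order, once available, by gl_timer_query()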
timer->index_write = (timer->index_write + 1) % QUERY_OBJECT_NUM; + if (timer->index_write == timer->index_read) { + // forcibly drop the least recent result to make space + timer->index_read = (timer->index_read + 1) % QUERY_OBJECT_NUM; + } +} + +static void gl_gpu_flush(pl_gpu gpu) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + if (!MAKE_CURRENT()) + return; + + gl->Flush(); + gl_check_err(gpu, "gl_gpu_flush"); + RELEASE_CURRENT(); +} + +static void gl_gpu_finish(pl_gpu gpu) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + if (!MAKE_CURRENT()) + return; + + gl->Finish(); + gl_check_err(gpu, "gl_gpu_finish"); + RELEASE_CURRENT(); +} + +static bool gl_gpu_is_failed(pl_gpu gpu) +{ + struct pl_gl *gl = PL_PRIV(gpu); + return gl->failed; +} + +static const struct pl_gpu_fns pl_fns_gl = { + .destroy = gl_gpu_destroy, + .tex_create = gl_tex_create, + .tex_destroy = gl_tex_destroy, + .tex_invalidate = gl_tex_invalidate, + .tex_clear_ex = gl_tex_clear_ex, + .tex_blit = gl_tex_blit, + .tex_upload = gl_tex_upload, + .tex_download = gl_tex_download, + .buf_create = gl_buf_create, + .buf_destroy = gl_buf_destroy, + .buf_write = gl_buf_write, + .buf_read = gl_buf_read, + .buf_copy = gl_buf_copy, + .buf_poll = gl_buf_poll, + .desc_namespace = gl_desc_namespace, + .pass_create = gl_pass_create, + .pass_destroy = gl_pass_destroy, + .pass_run = gl_pass_run, + .timer_create = gl_timer_create, + .timer_destroy = gl_timer_destroy, + .timer_query = gl_timer_query, + .gpu_flush = gl_gpu_flush, + .gpu_finish = gl_gpu_finish, + .gpu_is_failed = gl_gpu_is_failed, +}; diff --git a/src/opengl/gpu.h b/src/opengl/gpu.h new file mode 100644 index 0000000..50741d0 --- /dev/null +++ b/src/opengl/gpu.h @@ -0,0 +1,141 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#pragma once + +#include "../gpu.h" +#include "common.h" + +// Thread safety: Unsafe, same as pl_gpu_destroy +pl_gpu pl_gpu_create_gl(pl_log log, pl_opengl gl, const struct pl_opengl_params *params); + +// --- pl_gpu internal structs and functions + +struct pl_gl { + struct pl_gpu_fns impl; + pl_opengl gl; + bool failed; + + // For import/export + EGLDisplay egl_dpy; + EGLContext egl_ctx; + bool egl_storage; +#ifdef PL_HAVE_UNIX + // List of formats supported by EGL_EXT_image_dma_buf_import + PL_ARRAY(EGLint) egl_formats; +#endif + + // Sync objects and associated callbacks + PL_ARRAY(struct gl_cb) callbacks; + + + // Incrementing counters to keep track of object uniqueness + int buf_id; + + // Cached capabilities + int gl_ver; + int gles_ver; + bool has_storage; + bool has_invalidate_fb; + bool has_invalidate_tex; + bool has_vao; + bool has_queries; + bool has_modifiers; + bool has_readback; + bool has_egl_storage; + bool has_egl_import; + int gather_comps; +}; + +static inline const gl_funcs *gl_funcs_get(pl_gpu gpu) +{ + struct pl_gl *p = PL_PRIV(gpu); + struct gl_ctx *glctx = PL_PRIV(p->gl); + return &glctx->func; +} + +void gl_timer_begin(pl_gpu gpu, pl_timer timer); +void gl_timer_end(pl_gpu gpu, pl_timer timer); + +static inline bool _make_current(pl_gpu gpu) +{ + struct pl_gl *p = PL_PRIV(gpu); + if (!gl_make_current(p->gl)) { + p->failed = true; + return false; + } + + return true; +} + +static inline void _release_current(pl_gpu gpu) +{ + struct pl_gl *p = PL_PRIV(gpu); + gl_release_current(p->gl); +} + +#define MAKE_CURRENT() _make_current(gpu) +#define RELEASE_CURRENT() _release_current(gpu) + +struct pl_tex_gl { + GLenum target; + GLuint texture; + bool wrapped_tex; + GLuint fbo; // or 0 + bool wrapped_fb; + GLbitfield barrier; + + // GL format fields + GLenum format; + GLint iformat; + GLenum type; + + // For imported/exported textures + EGLImageKHR image; + int fd; +}; + +pl_tex gl_tex_create(pl_gpu, const struct pl_tex_params *); +void gl_tex_destroy(pl_gpu, pl_tex); +void gl_tex_invalidate(pl_gpu, pl_tex); +void gl_tex_clear_ex(pl_gpu, pl_tex, const union pl_clear_color); +void gl_tex_blit(pl_gpu, const struct pl_tex_blit_params *); +bool gl_tex_upload(pl_gpu, const struct pl_tex_transfer_params *); +bool gl_tex_download(pl_gpu, const struct pl_tex_transfer_params *); + +struct pl_buf_gl { + uint64_t id; // unique per buffer + GLuint buffer; + size_t offset; + GLsync fence; + GLbitfield barrier; + bool mapped; +}; + +pl_buf gl_buf_create(pl_gpu, const struct pl_buf_params *); +void gl_buf_destroy(pl_gpu, pl_buf); +void gl_buf_write(pl_gpu, pl_buf, size_t offset, const void *src, size_t size); +bool gl_buf_read(pl_gpu, pl_buf, size_t offset, void *dst, size_t size); +void gl_buf_copy(pl_gpu, pl_buf dst, size_t dst_offset, + pl_buf src, size_t src_offset, size_t size); +bool gl_buf_poll(pl_gpu, pl_buf, uint64_t timeout); + +struct pl_pass_gl; +int gl_desc_namespace(pl_gpu, enum pl_desc_type type); +pl_pass gl_pass_create(pl_gpu, const struct pl_pass_params *); +void gl_pass_destroy(pl_gpu, pl_pass); +void gl_pass_run(pl_gpu, const struct pl_pass_run_params *); diff --git a/src/opengl/gpu_pass.c b/src/opengl/gpu_pass.c new file mode 100644 index 0000000..58e69a5 --- /dev/null +++ b/src/opengl/gpu_pass.c @@ -0,0 +1,707 @@ +/* + * This file is part of libplacebo. 
+ * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "gpu.h" +#include "cache.h" +#include "formats.h" +#include "utils.h" + +int gl_desc_namespace(pl_gpu gpu, enum pl_desc_type type) +{ + return (int) type; +} + +struct gl_cache_header { + GLenum format; +}; + +static GLuint load_cached_program(pl_gpu gpu, pl_cache cache, pl_cache_obj *obj) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + if (!gl_test_ext(gpu, "GL_ARB_get_program_binary", 41, 30)) + return 0; + + if (!pl_cache_get(cache, obj)) + return 0; + + if (obj->size < sizeof(struct gl_cache_header)) + return 0; + + GLuint prog = gl->CreateProgram(); + if (!gl_check_err(gpu, "load_cached_program: glCreateProgram")) + return 0; + + struct gl_cache_header *header = (struct gl_cache_header *) obj->data; + pl_str rest = (pl_str) { obj->data, obj->size }; + rest = pl_str_drop(rest, sizeof(*header)); + gl->ProgramBinary(prog, header->format, rest.buf, rest.len); + gl->GetError(); // discard potential useless error + + GLint status = 0; + gl->GetProgramiv(prog, GL_LINK_STATUS, &status); + if (status) + return prog; + + gl->DeleteProgram(prog); + gl_check_err(gpu, "load_cached_program: glProgramBinary"); + return 0; +} + +static enum pl_log_level gl_log_level(GLint status, GLint log_length) +{ + if (!status) { + return PL_LOG_ERR; + } else if (log_length > 0) { + return PL_LOG_INFO; + } else { + return PL_LOG_DEBUG; + } +} + +static bool gl_attach_shader(pl_gpu gpu, GLuint program, GLenum type, const char *src) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + GLuint shader = gl->CreateShader(type); + gl->ShaderSource(shader, 1, &src, NULL); + gl->CompileShader(shader); + + GLint status = 0; + gl->GetShaderiv(shader, GL_COMPILE_STATUS, &status); + GLint log_length = 0; + gl->GetShaderiv(shader, GL_INFO_LOG_LENGTH, &log_length); + + enum pl_log_level level = gl_log_level(status, log_length); + if (pl_msg_test(gpu->log, level)) { + GLchar *logstr = pl_zalloc(NULL, log_length + 1); + gl->GetShaderInfoLog(shader, log_length, NULL, logstr); + PL_MSG(gpu, level, "shader compile log (status=%d): %s", status, logstr); + pl_free(logstr); + } + + if (!status || !gl_check_err(gpu, "gl_attach_shader")) + goto error; + + gl->AttachShader(program, shader); + gl->DeleteShader(shader); + return true; + +error: + gl->DeleteShader(shader); + return false; +} + +static GLuint gl_compile_program(pl_gpu gpu, const struct pl_pass_params *params) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + GLuint prog = gl->CreateProgram(); + bool ok = true; + + switch (params->type) { + case PL_PASS_COMPUTE: + ok &= gl_attach_shader(gpu, prog, GL_COMPUTE_SHADER, params->glsl_shader); + break; + case PL_PASS_RASTER: + ok &= gl_attach_shader(gpu, prog, GL_VERTEX_SHADER, params->vertex_shader); + ok &= gl_attach_shader(gpu, prog, GL_FRAGMENT_SHADER, params->glsl_shader); + for (int i = 0; i < params->num_vertex_attribs; i++) + gl->BindAttribLocation(prog, 
i, params->vertex_attribs[i].name); + break; + case PL_PASS_INVALID: + case PL_PASS_TYPE_COUNT: + pl_unreachable(); + } + + if (!ok || !gl_check_err(gpu, "gl_compile_program: attach shader")) + goto error; + + gl->LinkProgram(prog); + GLint status = 0; + gl->GetProgramiv(prog, GL_LINK_STATUS, &status); + GLint log_length = 0; + gl->GetProgramiv(prog, GL_INFO_LOG_LENGTH, &log_length); + + enum pl_log_level level = gl_log_level(status, log_length); + if (pl_msg_test(gpu->log, level)) { + GLchar *logstr = pl_zalloc(NULL, log_length + 1); + gl->GetProgramInfoLog(prog, log_length, NULL, logstr); + PL_MSG(gpu, level, "shader link log (status=%d): %s", status, logstr); + pl_free(logstr); + } + + if (!gl_check_err(gpu, "gl_compile_program: link program")) + goto error; + + return prog; + +error: + gl->DeleteProgram(prog); + PL_ERR(gpu, "Failed compiling/linking GLSL program"); + return 0; +} + +// For pl_pass.priv +struct pl_pass_gl { + GLuint program; + GLuint vao; // the VAO object + uint64_t vao_id; // buf_gl.id of VAO + size_t vao_offset; // VBO offset of VAO + GLuint buffer; // VBO for raw vertex pointers + GLuint index_buffer; + GLint *var_locs; +}; + +void gl_pass_destroy(pl_gpu gpu, pl_pass pass) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + if (!MAKE_CURRENT()) { + PL_ERR(gpu, "Failed uninitializing pass, leaking resources!"); + return; + } + + struct pl_pass_gl *pass_gl = PL_PRIV(pass); + if (pass_gl->vao) + gl->DeleteVertexArrays(1, &pass_gl->vao); + gl->DeleteBuffers(1, &pass_gl->index_buffer); + gl->DeleteBuffers(1, &pass_gl->buffer); + gl->DeleteProgram(pass_gl->program); + + gl_check_err(gpu, "gl_pass_destroy"); + RELEASE_CURRENT(); + pl_free((void *) pass); +} + +static void gl_update_va(pl_gpu gpu, pl_pass pass, size_t vbo_offset) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + for (int i = 0; i < pass->params.num_vertex_attribs; i++) { + const struct pl_vertex_attrib *va = &pass->params.vertex_attribs[i]; + const struct gl_format **glfmtp = PL_PRIV(va->fmt); + const struct gl_format *glfmt = *glfmtp; + + bool norm = false; + switch (va->fmt->type) { + case PL_FMT_UNORM: + case PL_FMT_SNORM: + norm = true; + break; + + case PL_FMT_UNKNOWN: + case PL_FMT_FLOAT: + case PL_FMT_UINT: + case PL_FMT_SINT: + break; + case PL_FMT_TYPE_COUNT: + pl_unreachable(); + } + + gl->EnableVertexAttribArray(i); + gl->VertexAttribPointer(i, va->fmt->num_components, glfmt->type, norm, + pass->params.vertex_stride, + (void *) (va->offset + vbo_offset)); + } +} + +pl_pass gl_pass_create(pl_gpu gpu, const struct pl_pass_params *params) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + if (!MAKE_CURRENT()) + return NULL; + + struct pl_gl *p = PL_PRIV(gpu); + struct pl_pass_t *pass = pl_zalloc_obj(NULL, pass, struct pl_pass_gl); + struct pl_pass_gl *pass_gl = PL_PRIV(pass); + pl_cache cache = pl_gpu_cache(gpu); + pass->params = pl_pass_params_copy(pass, params); + + pl_cache_obj obj = { .key = CACHE_KEY_GL_PROG }; + if (cache) { + pl_hash_merge(&obj.key, pl_str0_hash(params->glsl_shader)); + if (params->type == PL_PASS_RASTER) + pl_hash_merge(&obj.key, pl_str0_hash(params->vertex_shader)); + } + + // Load/Compile program + if ((pass_gl->program = load_cached_program(gpu, cache, &obj))) { + PL_DEBUG(gpu, "Using cached GL program"); + } else { + pl_clock_t start = pl_clock_now(); + pass_gl->program = gl_compile_program(gpu, params); + pl_log_cpu_time(gpu->log, start, pl_clock_now(), "compiling shader"); + } + + if (!pass_gl->program) + goto error; + + // Update program cache if possible + if (cache && 
gl_test_ext(gpu, "GL_ARB_get_program_binary", 41, 30)) { + GLint buf_size = 0; + gl->GetProgramiv(pass_gl->program, GL_PROGRAM_BINARY_LENGTH, &buf_size); + if (buf_size > 0) { + buf_size += sizeof(struct gl_cache_header); + pl_cache_obj_resize(NULL, &obj, buf_size); + struct gl_cache_header *header = obj.data; + void *buffer = &header[1]; + GLsizei binary_size = 0; + gl->GetProgramBinary(pass_gl->program, buf_size, &binary_size, + &header->format, buffer); + bool ok = gl_check_err(gpu, "gl_pass_create: get program binary"); + if (ok) { + obj.size = sizeof(*header) + binary_size; + pl_assert(obj.size <= buf_size); + pl_cache_set(cache, &obj); + } + } + } + + gl->UseProgram(pass_gl->program); + pass_gl->var_locs = pl_calloc(pass, params->num_variables, sizeof(GLint)); + + for (int i = 0; i < params->num_variables; i++) { + pass_gl->var_locs[i] = gl->GetUniformLocation(pass_gl->program, + params->variables[i].name); + + // Due to OpenGL API restrictions, we need to ensure that this is a + // variable type we can actually *update*. Fortunately, this is easily + // checked by virtue of the fact that all legal combinations of + // parameters will have a valid GLSL type name + if (!pl_var_glsl_type_name(params->variables[i])) { + gl->UseProgram(0); + PL_ERR(gpu, "Input variable '%s' does not match any known type!", + params->variables[i].name); + goto error; + } + } + + for (int i = 0; i < params->num_descriptors; i++) { + const struct pl_desc *desc = ¶ms->descriptors[i]; + switch (desc->type) { + case PL_DESC_SAMPLED_TEX: + case PL_DESC_STORAGE_IMG: { + // For compatibility with older OpenGL, we need to explicitly + // update the texture/image unit bindings after creating the shader + // program, since specifying it directly requires GLSL 4.20+ + GLint loc = gl->GetUniformLocation(pass_gl->program, desc->name); + gl->Uniform1i(loc, desc->binding); + break; + } + case PL_DESC_BUF_UNIFORM: { + GLuint idx = gl->GetUniformBlockIndex(pass_gl->program, desc->name); + gl->UniformBlockBinding(pass_gl->program, idx, desc->binding); + break; + } + case PL_DESC_BUF_STORAGE: { + GLuint idx = gl->GetProgramResourceIndex(pass_gl->program, + GL_SHADER_STORAGE_BLOCK, + desc->name); + gl->ShaderStorageBlockBinding(pass_gl->program, idx, desc->binding); + break; + } + case PL_DESC_BUF_TEXEL_UNIFORM: + case PL_DESC_BUF_TEXEL_STORAGE: + assert(!"unimplemented"); // TODO + case PL_DESC_INVALID: + case PL_DESC_TYPE_COUNT: + pl_unreachable(); + } + } + + gl->UseProgram(0); + + // Initialize the VAO and single vertex buffer + gl->GenBuffers(1, &pass_gl->buffer); + if (p->has_vao) { + gl->GenVertexArrays(1, &pass_gl->vao); + gl->BindBuffer(GL_ARRAY_BUFFER, pass_gl->buffer); + gl->BindVertexArray(pass_gl->vao); + gl_update_va(gpu, pass, 0); + gl->BindVertexArray(0); + gl->BindBuffer(GL_ARRAY_BUFFER, 0); + } + + if (!gl_check_err(gpu, "gl_pass_create")) + goto error; + + pl_cache_obj_free(&obj); + RELEASE_CURRENT(); + return pass; + +error: + PL_ERR(gpu, "Failed creating pass"); + pl_cache_obj_free(&obj); + gl_pass_destroy(gpu, pass); + RELEASE_CURRENT(); + return NULL; +} + +static void update_var(pl_gpu gpu, pl_pass pass, + const struct pl_var_update *vu) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + struct pl_pass_gl *pass_gl = PL_PRIV(pass); + const struct pl_var *var = &pass->params.variables[vu->index]; + GLint loc = pass_gl->var_locs[vu->index]; + + switch (var->type) { + case PL_VAR_SINT: { + const int *i = vu->data; + pl_assert(var->dim_m == 1); + switch (var->dim_v) { + case 1: gl->Uniform1iv(loc, var->dim_a, 
i); break; + case 2: gl->Uniform2iv(loc, var->dim_a, i); break; + case 3: gl->Uniform3iv(loc, var->dim_a, i); break; + case 4: gl->Uniform4iv(loc, var->dim_a, i); break; + default: pl_unreachable(); + } + return; + } + case PL_VAR_UINT: { + const unsigned int *u = vu->data; + pl_assert(var->dim_m == 1); + switch (var->dim_v) { + case 1: gl->Uniform1uiv(loc, var->dim_a, u); break; + case 2: gl->Uniform2uiv(loc, var->dim_a, u); break; + case 3: gl->Uniform3uiv(loc, var->dim_a, u); break; + case 4: gl->Uniform4uiv(loc, var->dim_a, u); break; + default: pl_unreachable(); + } + return; + } + case PL_VAR_FLOAT: { + const float *f = vu->data; + if (var->dim_m == 1) { + switch (var->dim_v) { + case 1: gl->Uniform1fv(loc, var->dim_a, f); break; + case 2: gl->Uniform2fv(loc, var->dim_a, f); break; + case 3: gl->Uniform3fv(loc, var->dim_a, f); break; + case 4: gl->Uniform4fv(loc, var->dim_a, f); break; + default: pl_unreachable(); + } + } else if (var->dim_m == 2 && var->dim_v == 2) { + gl->UniformMatrix2fv(loc, var->dim_a, GL_FALSE, f); + } else if (var->dim_m == 3 && var->dim_v == 3) { + gl->UniformMatrix3fv(loc, var->dim_a, GL_FALSE, f); + } else if (var->dim_m == 4 && var->dim_v == 4) { + gl->UniformMatrix4fv(loc, var->dim_a, GL_FALSE, f); + } else if (var->dim_m == 2 && var->dim_v == 3) { + gl->UniformMatrix2x3fv(loc, var->dim_a, GL_FALSE, f); + } else if (var->dim_m == 3 && var->dim_v == 2) { + gl->UniformMatrix3x2fv(loc, var->dim_a, GL_FALSE, f); + } else if (var->dim_m == 2 && var->dim_v == 4) { + gl->UniformMatrix2x4fv(loc, var->dim_a, GL_FALSE, f); + } else if (var->dim_m == 4 && var->dim_v == 2) { + gl->UniformMatrix4x2fv(loc, var->dim_a, GL_FALSE, f); + } else if (var->dim_m == 3 && var->dim_v == 4) { + gl->UniformMatrix3x4fv(loc, var->dim_a, GL_FALSE, f); + } else if (var->dim_m == 4 && var->dim_v == 3) { + gl->UniformMatrix4x3fv(loc, var->dim_a, GL_FALSE, f); + } else { + pl_unreachable(); + } + return; + } + + case PL_VAR_INVALID: + case PL_VAR_TYPE_COUNT: + break; + } + + pl_unreachable(); +} + +static void update_desc(pl_gpu gpu, pl_pass pass, int index, + const struct pl_desc_binding *db) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + const struct pl_desc *desc = &pass->params.descriptors[index]; + + static const GLenum access[] = { + [PL_DESC_ACCESS_READWRITE] = GL_READ_WRITE, + [PL_DESC_ACCESS_READONLY] = GL_READ_ONLY, + [PL_DESC_ACCESS_WRITEONLY] = GL_WRITE_ONLY, + }; + + static const GLint wraps[PL_TEX_ADDRESS_MODE_COUNT] = { + [PL_TEX_ADDRESS_CLAMP] = GL_CLAMP_TO_EDGE, + [PL_TEX_ADDRESS_REPEAT] = GL_REPEAT, + [PL_TEX_ADDRESS_MIRROR] = GL_MIRRORED_REPEAT, + }; + + static const GLint filters[PL_TEX_SAMPLE_MODE_COUNT] = { + [PL_TEX_SAMPLE_NEAREST] = GL_NEAREST, + [PL_TEX_SAMPLE_LINEAR] = GL_LINEAR, + }; + + switch (desc->type) { + case PL_DESC_SAMPLED_TEX: { + pl_tex tex = db->object; + struct pl_tex_gl *tex_gl = PL_PRIV(tex); + gl->ActiveTexture(GL_TEXTURE0 + desc->binding); + gl->BindTexture(tex_gl->target, tex_gl->texture); + + GLint filter = filters[db->sample_mode]; + GLint wrap = wraps[db->address_mode]; + gl->TexParameteri(tex_gl->target, GL_TEXTURE_MIN_FILTER, filter); + gl->TexParameteri(tex_gl->target, GL_TEXTURE_MAG_FILTER, filter); + switch (pl_tex_params_dimension(tex->params)) { + case 3: gl->TexParameteri(tex_gl->target, GL_TEXTURE_WRAP_R, wrap); // fall through + case 2: gl->TexParameteri(tex_gl->target, GL_TEXTURE_WRAP_T, wrap); // fall through + case 1: gl->TexParameteri(tex_gl->target, GL_TEXTURE_WRAP_S, wrap); break; + } + return; + } + case 
PL_DESC_STORAGE_IMG: { + pl_tex tex = db->object; + struct pl_tex_gl *tex_gl = PL_PRIV(tex); + gl->BindImageTexture(desc->binding, tex_gl->texture, 0, GL_FALSE, 0, + access[desc->access], tex_gl->iformat); + return; + } + case PL_DESC_BUF_UNIFORM: { + pl_buf buf = db->object; + struct pl_buf_gl *buf_gl = PL_PRIV(buf); + gl->BindBufferRange(GL_UNIFORM_BUFFER, desc->binding, buf_gl->buffer, + buf_gl->offset, buf->params.size); + return; + } + case PL_DESC_BUF_STORAGE: { + pl_buf buf = db->object; + struct pl_buf_gl *buf_gl = PL_PRIV(buf); + gl->BindBufferRange(GL_SHADER_STORAGE_BUFFER, desc->binding, buf_gl->buffer, + buf_gl->offset, buf->params.size); + return; + } + case PL_DESC_BUF_TEXEL_UNIFORM: + case PL_DESC_BUF_TEXEL_STORAGE: + assert(!"unimplemented"); // TODO + + case PL_DESC_INVALID: + case PL_DESC_TYPE_COUNT: + break; + } + + pl_unreachable(); +} + +static void unbind_desc(pl_gpu gpu, pl_pass pass, int index, + const struct pl_desc_binding *db) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + const struct pl_desc *desc = &pass->params.descriptors[index]; + + switch (desc->type) { + case PL_DESC_SAMPLED_TEX: { + pl_tex tex = db->object; + struct pl_tex_gl *tex_gl = PL_PRIV(tex); + gl->ActiveTexture(GL_TEXTURE0 + desc->binding); + gl->BindTexture(tex_gl->target, 0); + return; + } + case PL_DESC_STORAGE_IMG: { + pl_tex tex = db->object; + struct pl_tex_gl *tex_gl = PL_PRIV(tex); + gl->BindImageTexture(desc->binding, 0, 0, GL_FALSE, 0, + GL_WRITE_ONLY, GL_R32F); + if (desc->access != PL_DESC_ACCESS_READONLY) + gl->MemoryBarrier(tex_gl->barrier); + return; + } + case PL_DESC_BUF_UNIFORM: + gl->BindBufferBase(GL_UNIFORM_BUFFER, desc->binding, 0); + return; + case PL_DESC_BUF_STORAGE: { + pl_buf buf = db->object; + struct pl_buf_gl *buf_gl = PL_PRIV(buf); + gl->BindBufferBase(GL_SHADER_STORAGE_BUFFER, desc->binding, 0); + if (desc->access != PL_DESC_ACCESS_READONLY) + gl->MemoryBarrier(buf_gl->barrier); + return; + } + case PL_DESC_BUF_TEXEL_UNIFORM: + case PL_DESC_BUF_TEXEL_STORAGE: + assert(!"unimplemented"); // TODO + case PL_DESC_INVALID: + case PL_DESC_TYPE_COUNT: + break; + } + + pl_unreachable(); +} + +void gl_pass_run(pl_gpu gpu, const struct pl_pass_run_params *params) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + if (!MAKE_CURRENT()) + return; + + pl_pass pass = params->pass; + struct pl_pass_gl *pass_gl = PL_PRIV(pass); + struct pl_gl *p = PL_PRIV(gpu); + + gl->UseProgram(pass_gl->program); + + for (int i = 0; i < params->num_var_updates; i++) + update_var(gpu, pass, ¶ms->var_updates[i]); + for (int i = 0; i < pass->params.num_descriptors; i++) + update_desc(gpu, pass, i, ¶ms->desc_bindings[i]); + gl->ActiveTexture(GL_TEXTURE0); + + if (!gl_check_err(gpu, "gl_pass_run: updating uniforms")) { + RELEASE_CURRENT(); + return; + } + + switch (pass->params.type) { + case PL_PASS_RASTER: { + struct pl_tex_gl *target_gl = PL_PRIV(params->target); + gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, target_gl->fbo); + if (!pass->params.load_target && p->has_invalidate_fb) { + GLenum fb = target_gl->fbo ? 
GL_COLOR_ATTACHMENT0 : GL_COLOR; + gl->InvalidateFramebuffer(GL_DRAW_FRAMEBUFFER, 1, &fb); + } + + gl->Viewport(params->viewport.x0, params->viewport.y0, + pl_rect_w(params->viewport), pl_rect_h(params->viewport)); + gl->Scissor(params->scissors.x0, params->scissors.y0, + pl_rect_w(params->scissors), pl_rect_h(params->scissors)); + gl->Enable(GL_SCISSOR_TEST); + gl->Disable(GL_DEPTH_TEST); + gl->Disable(GL_CULL_FACE); + gl_check_err(gpu, "gl_pass_run: enabling viewport/scissor"); + + const struct pl_blend_params *blend = pass->params.blend_params; + if (blend) { + static const GLenum map_blend[] = { + [PL_BLEND_ZERO] = GL_ZERO, + [PL_BLEND_ONE] = GL_ONE, + [PL_BLEND_SRC_ALPHA] = GL_SRC_ALPHA, + [PL_BLEND_ONE_MINUS_SRC_ALPHA] = GL_ONE_MINUS_SRC_ALPHA, + }; + + gl->BlendFuncSeparate(map_blend[blend->src_rgb], + map_blend[blend->dst_rgb], + map_blend[blend->src_alpha], + map_blend[blend->dst_alpha]); + gl->Enable(GL_BLEND); + gl_check_err(gpu, "gl_pass_run: enabling blend"); + } + + // Update VBO and VAO + pl_buf vert = params->vertex_buf; + struct pl_buf_gl *vert_gl = vert ? PL_PRIV(vert) : NULL; + gl->BindBuffer(GL_ARRAY_BUFFER, vert ? vert_gl->buffer : pass_gl->buffer); + + if (!vert) { + // Update the buffer directly. In theory we could also do a memcmp + // cache here to avoid unnecessary updates. + gl->BufferData(GL_ARRAY_BUFFER, pl_vertex_buf_size(params), + params->vertex_data, GL_STREAM_DRAW); + } + + if (pass_gl->vao) + gl->BindVertexArray(pass_gl->vao); + + uint64_t vert_id = vert ? vert_gl->id : 0; + size_t vert_offset = vert ? params->buf_offset : 0; + if (!pass_gl->vao || pass_gl->vao_id != vert_id || + pass_gl->vao_offset != vert_offset) + { + // We need to update the VAO when the buffer ID or offset changes + gl_update_va(gpu, pass, vert_offset); + pass_gl->vao_id = vert_id; + pass_gl->vao_offset = vert_offset; + } + + gl_check_err(gpu, "gl_pass_run: update/bind vertex buffer"); + + static const GLenum map_prim[PL_PRIM_TYPE_COUNT] = { + [PL_PRIM_TRIANGLE_LIST] = GL_TRIANGLES, + [PL_PRIM_TRIANGLE_STRIP] = GL_TRIANGLE_STRIP, + }; + GLenum mode = map_prim[pass->params.vertex_type]; + + gl_timer_begin(gpu, params->timer); + + if (params->index_data) { + + static const GLenum index_fmts[PL_INDEX_FORMAT_COUNT] = { + [PL_INDEX_UINT16] = GL_UNSIGNED_SHORT, + [PL_INDEX_UINT32] = GL_UNSIGNED_INT, + }; + + // Upload indices to temporary buffer object + if (!pass_gl->index_buffer) + gl->GenBuffers(1, &pass_gl->index_buffer); // lazily allocated + gl->BindBuffer(GL_ELEMENT_ARRAY_BUFFER, pass_gl->index_buffer); + gl->BufferData(GL_ELEMENT_ARRAY_BUFFER, pl_index_buf_size(params), + params->index_data, GL_STREAM_DRAW); + gl->DrawElements(mode, params->vertex_count, + index_fmts[params->index_fmt], 0); + gl->BindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0); + + } else if (params->index_buf) { + + // The pointer argument becomes the index buffer offset + struct pl_buf_gl *index_gl = PL_PRIV(params->index_buf); + gl->BindBuffer(GL_ELEMENT_ARRAY_BUFFER, index_gl->buffer); + gl->DrawElements(mode, params->vertex_count, GL_UNSIGNED_SHORT, + (void *) params->index_offset); + gl->BindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0); + + } else { + + // Note: the VBO offset is handled in the VAO + gl->DrawArrays(mode, 0, params->vertex_count); + } + + gl_timer_end(gpu, params->timer); + gl_check_err(gpu, "gl_pass_run: drawing"); + + if (pass_gl->vao) { + gl->BindVertexArray(0); + } else { + for (int i = 0; i < pass->params.num_vertex_attribs; i++) + gl->DisableVertexAttribArray(i); + } + + gl->BindBuffer(GL_ARRAY_BUFFER, 0); 
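+ // Restore the GL state touched for this draw: scissor, blending and the
+ // draw framebuffer binding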
+ gl->Disable(GL_SCISSOR_TEST); + gl->Disable(GL_BLEND); + gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, 0); + break; + } + + case PL_PASS_COMPUTE: + gl_timer_begin(gpu, params->timer); + gl->DispatchCompute(params->compute_groups[0], + params->compute_groups[1], + params->compute_groups[2]); + gl_timer_end(gpu, params->timer); + break; + + case PL_PASS_INVALID: + case PL_PASS_TYPE_COUNT: + pl_unreachable(); + } + + for (int i = 0; i < pass->params.num_descriptors; i++) + unbind_desc(gpu, pass, i, ¶ms->desc_bindings[i]); + gl->ActiveTexture(GL_TEXTURE0); + + gl->UseProgram(0); + gl_check_err(gpu, "gl_pass_run"); + RELEASE_CURRENT(); +} diff --git a/src/opengl/gpu_tex.c b/src/opengl/gpu_tex.c new file mode 100644 index 0000000..02eda77 --- /dev/null +++ b/src/opengl/gpu_tex.c @@ -0,0 +1,1078 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "gpu.h" +#include "formats.h" +#include "utils.h" + +#ifdef PL_HAVE_UNIX +#include <unistd.h> +#include <errno.h> +#endif + +void gl_tex_destroy(pl_gpu gpu, pl_tex tex) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + if (!MAKE_CURRENT()) { + PL_ERR(gpu, "Failed uninitializing texture, leaking resources!"); + return; + } + + struct pl_tex_gl *tex_gl = PL_PRIV(tex); + if (tex_gl->fbo && !tex_gl->wrapped_fb) + gl->DeleteFramebuffers(1, &tex_gl->fbo); + if (tex_gl->image) { + struct pl_gl *p = PL_PRIV(gpu); + eglDestroyImageKHR(p->egl_dpy, tex_gl->image); + } + if (!tex_gl->wrapped_tex) + gl->DeleteTextures(1, &tex_gl->texture); + +#ifdef PL_HAVE_UNIX + if (tex_gl->fd != -1) + close(tex_gl->fd); +#endif + + gl_check_err(gpu, "gl_tex_destroy"); + RELEASE_CURRENT(); + pl_free((void *) tex); +} + +static GLbitfield tex_barrier(pl_tex tex) +{ + GLbitfield barrier = 0; + const struct pl_tex_params *params = &tex->params; + + if (params->sampleable) + barrier |= GL_TEXTURE_FETCH_BARRIER_BIT; + if (params->renderable || params->blit_src || params->blit_dst) + barrier |= GL_FRAMEBUFFER_BARRIER_BIT; + if (params->storable) + barrier |= GL_SHADER_IMAGE_ACCESS_BARRIER_BIT; + if (params->host_writable || params->host_readable) + barrier |= GL_TEXTURE_UPDATE_BARRIER_BIT; + + return barrier; +} + +#define ADD_ATTRIB(name, value) \ + do { \ + assert(num_attribs + 3 < PL_ARRAY_SIZE(attribs)); \ + attribs[num_attribs++] = (name); \ + attribs[num_attribs++] = (value); \ + } while (0) + +#define ADD_DMABUF_PLANE_ATTRIBS(plane, fd, offset, stride) \ + do { \ + ADD_ATTRIB(EGL_DMA_BUF_PLANE ## plane ## _FD_EXT, \ + fd); \ + ADD_ATTRIB(EGL_DMA_BUF_PLANE ## plane ## _OFFSET_EXT, \ + offset); \ + ADD_ATTRIB(EGL_DMA_BUF_PLANE ## plane ## _PITCH_EXT, \ + stride); \ + } while (0) + +#define ADD_DMABUF_PLANE_MODIFIERS(plane, mod) \ + do { \ + ADD_ATTRIB(EGL_DMA_BUF_PLANE ## plane ## _MODIFIER_LO_EXT, \ + (uint32_t) ((mod) & 0xFFFFFFFFlu)); \ + ADD_ATTRIB(EGL_DMA_BUF_PLANE ## plane ## _MODIFIER_HI_EXT, \ + (uint32_t) 
(((mod) >> 32u) & 0xFFFFFFFFlu)); \ + } while (0) + +static bool gl_tex_import(pl_gpu gpu, + enum pl_handle_type handle_type, + const struct pl_shared_mem *shared_mem, + struct pl_tex_t *tex) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + struct pl_gl *p = PL_PRIV(gpu); + if (!MAKE_CURRENT()) + return false; + + struct pl_tex_gl *tex_gl = PL_PRIV(tex); + const struct pl_tex_params *params = &tex->params; + + int attribs[20] = {}; + int num_attribs = 0; + ADD_ATTRIB(EGL_WIDTH, params->w); + ADD_ATTRIB(EGL_HEIGHT, params->h); + + switch (handle_type) { + +#ifdef PL_HAVE_UNIX + case PL_HANDLE_DMA_BUF: + if (shared_mem->handle.fd == -1) { + PL_ERR(gpu, "%s: invalid fd", __func__); + goto error; + } + + tex_gl->fd = dup(shared_mem->handle.fd); + if (tex_gl->fd == -1) { + PL_ERR(gpu, "%s: cannot duplicate fd %d for importing: %s", + __func__, shared_mem->handle.fd, strerror(errno)); + goto error; + } + + ADD_ATTRIB(EGL_LINUX_DRM_FOURCC_EXT, params->format->fourcc); + ADD_DMABUF_PLANE_ATTRIBS(0, tex_gl->fd, shared_mem->offset, + PL_DEF(shared_mem->stride_w, params->w)); + if (p->has_modifiers) + ADD_DMABUF_PLANE_MODIFIERS(0, shared_mem->drm_format_mod); + + attribs[num_attribs] = EGL_NONE; + + // EGL_LINUX_DMA_BUF_EXT requires EGL_NO_CONTEXT + tex_gl->image = eglCreateImageKHR(p->egl_dpy, + EGL_NO_CONTEXT, + EGL_LINUX_DMA_BUF_EXT, + (EGLClientBuffer) NULL, + attribs); + + break; +#else // !PL_HAVE_UNIX + case PL_HANDLE_DMA_BUF: + pl_unreachable(); +#endif + + case PL_HANDLE_WIN32: + case PL_HANDLE_WIN32_KMT: + case PL_HANDLE_HOST_PTR: + case PL_HANDLE_FD: + case PL_HANDLE_MTL_TEX: + case PL_HANDLE_IOSURFACE: + pl_unreachable(); + + } + + if (!egl_check_err(gpu, "eglCreateImageKHR") || !tex_gl->image) + goto error; + + // tex_gl->image should be already bound + if (p->has_egl_storage) { + gl->EGLImageTargetTexStorageEXT(GL_TEXTURE_2D, tex_gl->image, NULL); + } else { + gl->EGLImageTargetTexture2DOES(GL_TEXTURE_2D, tex_gl->image); + } + if (!egl_check_err(gpu, "EGLImageTargetTexture2DOES")) + goto error; + + RELEASE_CURRENT(); + return true; + +error: + PL_ERR(gpu, "Failed importing GL texture!"); + RELEASE_CURRENT(); + return false; +} + +static EGLenum egl_from_gl_target(pl_gpu gpu, int target) +{ + switch(target) { + case GL_TEXTURE_2D: return EGL_GL_TEXTURE_2D; + case GL_TEXTURE_3D: return EGL_GL_TEXTURE_3D; + default: + PL_ERR(gpu, "%s: unsupported texture target 0x%x", __func__, target); + return 0; + } +} + +static bool gl_tex_export(pl_gpu gpu, enum pl_handle_type handle_type, + bool preserved, struct pl_tex_t *tex) +{ + struct pl_tex_gl *tex_gl = PL_PRIV(tex); + struct pl_gl *p = PL_PRIV(gpu); + + EGLenum egltarget = egl_from_gl_target(gpu, tex_gl->target); + if (!egltarget) + goto error; + + int attribs[] = { + EGL_IMAGE_PRESERVED, preserved, + EGL_NONE, + }; + + // We assume that tex_gl->texture is already bound + tex_gl->image = eglCreateImageKHR(p->egl_dpy, + p->egl_ctx, + egltarget, + (EGLClientBuffer) (uintptr_t) tex_gl->texture, + attribs); + if (!egl_check_err(gpu, "eglCreateImageKHR") || !tex_gl->image) + goto error; + + switch (handle_type) { + +#ifdef PL_HAVE_UNIX + case PL_HANDLE_DMA_BUF: { + int fourcc = 0; + int num_planes = 0; + EGLuint64KHR modifier = 0; + bool ok; + ok = eglExportDMABUFImageQueryMESA(p->egl_dpy, + tex_gl->image, + &fourcc, + &num_planes, + &modifier); + if (!egl_check_err(gpu, "eglExportDMABUFImageQueryMESA") || !ok) + goto error; + + if (fourcc != tex->params.format->fourcc) { + PL_ERR(gpu, "Exported DRM format %s does not match fourcc of " + "specified 
pl_fmt %s? Please open a bug.", + PRINT_FOURCC(fourcc), PRINT_FOURCC(tex->params.format->fourcc)); + goto error; + } + + if (num_planes != 1) { + PL_ERR(gpu, "Unsupported number of planes: %d", num_planes); + goto error; + } + + int offset = 0, stride = 0; + ok = eglExportDMABUFImageMESA(p->egl_dpy, + tex_gl->image, + &tex_gl->fd, + &stride, + &offset); + if (!egl_check_err(gpu, "eglExportDMABUFImageMesa") || !ok) + goto error; + + off_t fdsize = lseek(tex_gl->fd, 0, SEEK_END); + off_t err = fdsize > 0 && lseek(tex_gl->fd, 0, SEEK_SET); + if (fdsize <= 0 || err < 0) { + PL_ERR(gpu, "Failed querying FD size: %s", strerror(errno)); + goto error; + } + + tex->shared_mem = (struct pl_shared_mem) { + .handle.fd = tex_gl->fd, + .size = fdsize, + .offset = offset, + .drm_format_mod = modifier, + .stride_w = stride, + }; + break; + } +#else // !PL_HAVE_UNIX + case PL_HANDLE_DMA_BUF: + pl_unreachable(); +#endif + + case PL_HANDLE_WIN32: + case PL_HANDLE_WIN32_KMT: + case PL_HANDLE_HOST_PTR: + case PL_HANDLE_FD: + case PL_HANDLE_MTL_TEX: + case PL_HANDLE_IOSURFACE: + pl_unreachable(); + + } + + return true; + +error: + PL_ERR(gpu, "Failed exporting GL texture!"); + return false; +} + +static const char *fb_err_str(GLenum err) +{ + switch (err) { +#define CASE(name) case name: return #name + CASE(GL_FRAMEBUFFER_COMPLETE); + CASE(GL_FRAMEBUFFER_UNDEFINED); + CASE(GL_FRAMEBUFFER_INCOMPLETE_ATTACHMENT); + CASE(GL_FRAMEBUFFER_INCOMPLETE_MISSING_ATTACHMENT); + CASE(GL_FRAMEBUFFER_INCOMPLETE_DIMENSIONS); + CASE(GL_FRAMEBUFFER_INCOMPLETE_DRAW_BUFFER); + CASE(GL_FRAMEBUFFER_INCOMPLETE_READ_BUFFER); + CASE(GL_FRAMEBUFFER_UNSUPPORTED); + CASE(GL_FRAMEBUFFER_INCOMPLETE_MULTISAMPLE); + CASE(GL_FRAMEBUFFER_INCOMPLETE_LAYER_TARGETS); +#undef CASE + + default: return "unknown error"; + } +} + +pl_tex gl_tex_create(pl_gpu gpu, const struct pl_tex_params *params) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + if (!MAKE_CURRENT()) + return NULL; + + struct pl_gl *p = PL_PRIV(gpu); + struct pl_tex_t *tex = pl_zalloc_obj(NULL, tex, struct pl_tex_gl); + tex->params = *params; + tex->params.initial_data = NULL; + tex->sampler_type = PL_SAMPLER_NORMAL; + + struct pl_tex_gl *tex_gl = PL_PRIV(tex); + + const struct gl_format **fmtp = PL_PRIV(params->format); + const struct gl_format *fmt = *fmtp; + *tex_gl = (struct pl_tex_gl) { + .format = fmt->fmt, + .iformat = fmt->ifmt, + .type = fmt->type, + .barrier = tex_barrier(tex), + .fd = -1, + }; + + static const GLint targets[] = { + [1] = GL_TEXTURE_1D, + [2] = GL_TEXTURE_2D, + [3] = GL_TEXTURE_3D, + }; + + int dims = pl_tex_params_dimension(*params); + pl_assert(dims >= 1 && dims <= 3); + tex_gl->target = targets[dims]; + + gl->GenTextures(1, &tex_gl->texture); + gl->BindTexture(tex_gl->target, tex_gl->texture); + + if (params->import_handle) { + if (!gl_tex_import(gpu, params->import_handle, ¶ms->shared_mem, tex)) + goto error; + } else { + gl->PixelStorei(GL_UNPACK_ALIGNMENT, 1); + + switch (dims) { + case 1: + gl->TexImage1D(tex_gl->target, 0, tex_gl->iformat, params->w, 0, + tex_gl->format, tex_gl->type, params->initial_data); + break; + case 2: + gl->TexImage2D(tex_gl->target, 0, tex_gl->iformat, params->w, params->h, + 0, tex_gl->format, tex_gl->type, params->initial_data); + break; + case 3: + gl->TexImage3D(tex_gl->target, 0, tex_gl->iformat, params->w, params->h, + params->d, 0, tex_gl->format, tex_gl->type, + params->initial_data); + break; + } + + gl->PixelStorei(GL_UNPACK_ALIGNMENT, 4); + } + + if (params->export_handle) { + if (!gl_tex_export(gpu, 
params->export_handle, params->initial_data, tex)) + goto error; + } + + gl->BindTexture(tex_gl->target, 0); + + if (!gl_check_err(gpu, "gl_tex_create: texture")) + goto error; + + bool need_fbo = tex->params.renderable; + if (tex->params.blit_src || tex->params.blit_dst) { + if (dims != 2) { + PL_ERR(gpu, "Blittable textures may only be 2D!"); + goto error; + } + + need_fbo = true; + } + + bool can_fbo = tex->params.format->caps & PL_FMT_CAP_RENDERABLE && + tex->params.d == 0; + + // Try creating an FBO for host-readable textures, since this allows + // reading back with glReadPixels instead of glGetTexImage. (Additionally, + // GLES does not support glGetTexImage) + if (tex->params.host_readable && (can_fbo || p->gles_ver)) + need_fbo = true; + + if (need_fbo) { + if (!can_fbo) { + PL_ERR(gpu, "Trying to create a renderable/blittable/readable " + "texture with an incompatible (non-renderable) format!"); + goto error; + } + + gl->GenFramebuffers(1, &tex_gl->fbo); + gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, tex_gl->fbo); + switch (dims) { + case 1: + gl->FramebufferTexture1D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, + GL_TEXTURE_1D, tex_gl->texture, 0); + break; + case 2: + gl->FramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, + GL_TEXTURE_2D, tex_gl->texture, 0); + break; + case 3: pl_unreachable(); + } + + GLenum err = gl->CheckFramebufferStatus(GL_DRAW_FRAMEBUFFER); + if (err != GL_FRAMEBUFFER_COMPLETE) { + gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, 0); + PL_ERR(gpu, "Failed creating framebuffer: %s", fb_err_str(err)); + goto error; + } + + if (params->host_readable && p->gles_ver) { + GLint read_type = 0, read_fmt = 0; + gl->GetIntegerv(GL_IMPLEMENTATION_COLOR_READ_TYPE, &read_type); + gl->GetIntegerv(GL_IMPLEMENTATION_COLOR_READ_FORMAT, &read_fmt); + if (read_type != tex_gl->type || read_fmt != tex_gl->format) { + gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, 0); + PL_ERR(gpu, "Trying to create host_readable texture whose " + "implementation-defined pixel read format " + "(type=0x%X, fmt=0x%X) does not match the texture's " + "internal format (type=0x%X, fmt=0x%X)! This is a " + "GLES/driver limitation, there's little we can do " + "about it.", + read_type, read_fmt, tex_gl->type, tex_gl->format); + goto error; + } + } + + gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, 0); + if (!gl_check_err(gpu, "gl_tex_create: fbo")) + goto error; + } + + RELEASE_CURRENT(); + return tex; + +error: + gl_tex_destroy(gpu, tex); + RELEASE_CURRENT(); + return NULL; +} + +static bool gl_fb_query(pl_gpu gpu, int fbo, struct pl_fmt_t *fmt, + struct gl_format *glfmt) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + struct pl_gl *p = PL_PRIV(gpu); + *fmt = (struct pl_fmt_t) { + .name = "fbo", + .type = PL_FMT_UNKNOWN, + .caps = PL_FMT_CAP_RENDERABLE | PL_FMT_CAP_BLITTABLE | PL_FMT_CAP_BLENDABLE, + .num_components = 4, + .component_depth = {8, 8, 8, 8}, // default to rgba8 + .sample_order = {0, 1, 2, 3}, + }; + + *glfmt = (struct gl_format) { + .fmt = GL_RGBA, + }; + + bool can_query = gl_test_ext(gpu, "GL_ARB_framebuffer_object", 30, 20); + if (!fbo && p->gles_ver && p->gles_ver < 30) + can_query = false; // can't query default framebuffer on GLES 2.0 + + if (can_query) { + gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, fbo); + + GLenum obj = p->gles_ver ? 
GL_BACK : GL_BACK_LEFT; + if (fbo != 0) + obj = GL_COLOR_ATTACHMENT0; + + GLint type = 0; + gl->GetFramebufferAttachmentParameteriv(GL_DRAW_FRAMEBUFFER, obj, + GL_FRAMEBUFFER_ATTACHMENT_COMPONENT_TYPE, &type); + switch (type) { + case GL_FLOAT: fmt->type = PL_FMT_FLOAT; break; + case GL_INT: fmt->type = PL_FMT_SINT; break; + case GL_UNSIGNED_INT: fmt->type = PL_FMT_UINT; break; + case GL_SIGNED_NORMALIZED: fmt->type = PL_FMT_SNORM; break; + case GL_UNSIGNED_NORMALIZED: fmt->type = PL_FMT_UNORM; break; + default: fmt->type = PL_FMT_UNKNOWN; break; + } + + gl->GetFramebufferAttachmentParameteriv(GL_DRAW_FRAMEBUFFER, obj, + GL_FRAMEBUFFER_ATTACHMENT_RED_SIZE, &fmt->component_depth[0]); + gl->GetFramebufferAttachmentParameteriv(GL_DRAW_FRAMEBUFFER, obj, + GL_FRAMEBUFFER_ATTACHMENT_GREEN_SIZE, &fmt->component_depth[1]); + gl->GetFramebufferAttachmentParameteriv(GL_DRAW_FRAMEBUFFER, obj, + GL_FRAMEBUFFER_ATTACHMENT_BLUE_SIZE, &fmt->component_depth[2]); + gl->GetFramebufferAttachmentParameteriv(GL_DRAW_FRAMEBUFFER, obj, + GL_FRAMEBUFFER_ATTACHMENT_ALPHA_SIZE, &fmt->component_depth[3]); + + gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, 0); + gl_check_err(gpu, "gl_fb_query"); + + if (!fmt->component_depth[0]) { + PL_INFO(gpu, "OpenGL framebuffer did not export depth information," + "assuming 8-bit framebuffer"); + for (int i = 0; i < PL_ARRAY_SIZE(fmt->component_depth); i++) + fmt->component_depth[i] = 8; + } + + // Strip missing components from component map + while (!fmt->component_depth[fmt->num_components - 1]) { + fmt->num_components--; + pl_assert(fmt->num_components); + } + } + + int gpu_bits = 0; + for (int i = 0; i < 4; i++) + gpu_bits += fmt->component_depth[i]; + fmt->internal_size = (gpu_bits + 7) / 8; + + size_t host_size = 0; + switch (fmt->type) { + case PL_FMT_UNKNOWN: + fmt->opaque = true; + return true; + case PL_FMT_FLOAT: + glfmt->type = GL_FLOAT; + host_size = sizeof(float); + break; + case PL_FMT_UNORM: + case PL_FMT_UINT: + if (gpu_bits > 32) { + glfmt->type = GL_UNSIGNED_SHORT; + host_size = sizeof(uint16_t); + } else { + glfmt->type = GL_UNSIGNED_BYTE; + host_size = sizeof(uint8_t); + } + break; + case PL_FMT_SNORM: + case PL_FMT_SINT: + if (gpu_bits > 32) { + glfmt->type = GL_SHORT; + host_size = sizeof(int16_t); + } else { + glfmt->type = GL_BYTE; + host_size = sizeof(int8_t); + } + break; + case PL_FMT_TYPE_COUNT: + pl_unreachable(); + } + + fmt->texel_size = fmt->num_components * host_size; + for (int i = 0; i < fmt->num_components; i++) + fmt->host_bits[i] = 8 * host_size; + fmt->caps |= PL_FMT_CAP_HOST_READABLE; + + return true; +} + +pl_tex pl_opengl_wrap(pl_gpu gpu, const struct pl_opengl_wrap_params *params) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + if (!MAKE_CURRENT()) + return NULL; + + struct pl_gl *p = PL_PRIV(gpu); + struct pl_tex_t *tex = pl_alloc_obj(NULL, tex, struct pl_tex_gl); + struct pl_tex_gl *tex_gl = PL_PRIV(tex); + *tex = (struct pl_tex_t) { + .params = { + .w = params->width, + .h = params->height, + .d = params->depth, + }, + }; + + pl_fmt fmt = NULL; + const struct gl_format *glfmt = NULL; + + if (params->texture) { + // Wrapping texture: Require matching iformat + pl_assert(params->iformat); + for (int i = 0; i < gpu->num_formats; i++) { + const struct gl_format **glfmtp = PL_PRIV(gpu->formats[i]); + if ((*glfmtp)->ifmt == params->iformat) { + fmt = gpu->formats[i]; + glfmt = *glfmtp; + break; + } + } + + if (!fmt) { + PL_ERR(gpu, "Failed mapping iformat %d to any equivalent `pl_fmt`", + params->iformat); + goto error; + } + } else { + // 
Wrapping framebuffer: Allocate/infer generic FBO format + fmt = pl_alloc_obj((void *) gpu, fmt, const struct gl_format *); + glfmt = pl_alloc_ptr((void *) fmt, glfmt); + const struct gl_format **glfmtp = PL_PRIV(fmt); + *glfmtp = glfmt; + if (!gl_fb_query(gpu, params->framebuffer, + (struct pl_fmt_t *) fmt, + (struct gl_format *) glfmt)) + { + PL_ERR(gpu, "Failed querying framebuffer specifics!"); + pl_free((void *) fmt); + goto error; + } + } + + *tex_gl = (struct pl_tex_gl) { + .target = params->target, + .texture = params->texture, + .fbo = params->framebuffer, + .wrapped_tex = !!params->texture, + .wrapped_fb = params->framebuffer || !params->texture, + .iformat = glfmt->ifmt, + .format = glfmt->fmt, + .type = glfmt->type, + .fd = -1, + }; + + int dims = pl_tex_params_dimension(tex->params); + if (!tex_gl->target) { + switch (dims) { + case 1: tex_gl->target = GL_TEXTURE_1D; break; + case 2: tex_gl->target = GL_TEXTURE_2D; break; + case 3: tex_gl->target = GL_TEXTURE_3D; break; + } + } + + // Map texture-specific sampling metadata + if (params->texture) { + switch (params->target) { + case GL_TEXTURE_1D: + if (params->height || params->depth) { + PL_ERR(gpu, "Invalid texture dimensions for GL_TEXTURE_1D"); + goto error; + } + // fall through + case GL_TEXTURE_2D: + if (params->depth) { + PL_ERR(gpu, "Invalid texture dimensions for GL_TEXTURE_2D"); + goto error; + } + // fall through + case 0: + case GL_TEXTURE_3D: + tex->sampler_type = PL_SAMPLER_NORMAL; + break; + + case GL_TEXTURE_RECTANGLE: tex->sampler_type = PL_SAMPLER_RECT; break; + case GL_TEXTURE_EXTERNAL_OES: tex->sampler_type = PL_SAMPLER_EXTERNAL; break; + + default: + PL_ERR(gpu, "Failed mapping texture target %u to any equivalent " + "`pl_sampler_type`", params->target); + goto error; + } + } + + // Create optional extra fbo if needed/possible + bool can_fbo = tex_gl->texture && + (fmt->caps & PL_FMT_CAP_RENDERABLE) && + tex->sampler_type != PL_SAMPLER_EXTERNAL && + dims < 3; + + if (can_fbo && !tex_gl->fbo) { + gl->GenFramebuffers(1, &tex_gl->fbo); + gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, tex_gl->fbo); + switch (dims) { + case 1: + gl->FramebufferTexture1D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, + tex_gl->target, tex_gl->texture, 0); + break; + case 2: + gl->FramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, + tex_gl->target, tex_gl->texture, 0); + break; + } + + GLenum err = gl->CheckFramebufferStatus(GL_DRAW_FRAMEBUFFER); + if (err != GL_FRAMEBUFFER_COMPLETE) { + gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, 0); + PL_ERR(gpu, "Failed creating framebuffer: %s", fb_err_str(err)); + goto error; + } + + if (p->gles_ver) { + GLint read_type = 0, read_fmt = 0; + gl->GetIntegerv(GL_IMPLEMENTATION_COLOR_READ_TYPE, &read_type); + gl->GetIntegerv(GL_IMPLEMENTATION_COLOR_READ_FORMAT, &read_fmt); + tex->params.host_readable = read_type == tex_gl->type && + read_fmt == tex_gl->format; + } else { + tex->params.host_readable = true; + } + + gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, 0); + if (!gl_check_err(gpu, "pl_opengl_wrap: fbo")) + goto error; + } + + // Complete the process of inferring the texture capabilities + tex->params.format = fmt; + if (tex_gl->texture) { + tex->params.sampleable = fmt->caps & PL_FMT_CAP_SAMPLEABLE; + tex->params.storable = fmt->caps & PL_FMT_CAP_STORABLE; + tex->params.host_writable = !fmt->opaque; + tex->params.host_readable |= fmt->caps & PL_FMT_CAP_HOST_READABLE; + } + if (tex_gl->fbo || tex_gl->wrapped_fb) { + tex->params.renderable = fmt->caps & PL_FMT_CAP_RENDERABLE; +
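// An FBO (or wrapped default framebuffer) also allows reading the texture + // back with glReadPixels, so host readability only depends on the format: +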
tex->params.host_readable |= fmt->caps & PL_FMT_CAP_HOST_READABLE; + if (dims == 2 && (fmt->caps & PL_FMT_CAP_BLITTABLE)) { + tex->params.blit_src = true; + tex->params.blit_dst = true; + } + } + + tex_gl->barrier = tex_barrier(tex); + RELEASE_CURRENT(); + return tex; + +error: + gl_tex_destroy(gpu, tex); + RELEASE_CURRENT(); + return NULL; +} + +unsigned int pl_opengl_unwrap(pl_gpu gpu, pl_tex tex, + unsigned int *out_target, int *out_iformat, + unsigned int *out_fbo) +{ + struct pl_tex_gl *tex_gl = PL_PRIV(tex); + if (!tex_gl->texture) { + PL_ERR(gpu, "Trying to call `pl_opengl_unwrap` on a pseudo-texture " + "(perhaps obtained by `pl_swapchain_start_frame`?)"); + return 0; + } + + if (out_target) + *out_target = tex_gl->target; + if (out_iformat) + *out_iformat = tex_gl->iformat; + if (out_fbo) + *out_fbo = tex_gl->fbo; + + return tex_gl->texture; +} + +void gl_tex_invalidate(pl_gpu gpu, pl_tex tex) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + struct pl_gl *p = PL_PRIV(gpu); + struct pl_tex_gl *tex_gl = PL_PRIV(tex); + if (!MAKE_CURRENT()) + return; + + if (tex_gl->texture && p->has_invalidate_tex) + gl->InvalidateTexImage(tex_gl->texture, 0); + + if ((tex_gl->wrapped_fb || tex_gl->fbo) && p->has_invalidate_fb) { + GLenum attachment = tex_gl->fbo ? GL_COLOR_ATTACHMENT0 : GL_COLOR; + gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, tex_gl->fbo); + gl->InvalidateFramebuffer(GL_DRAW_FRAMEBUFFER, 1, &attachment); + gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, 0); + } + + gl_check_err(gpu, "gl_tex_invalidate"); + RELEASE_CURRENT(); +} + +void gl_tex_clear_ex(pl_gpu gpu, pl_tex tex, const union pl_clear_color color) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + if (!MAKE_CURRENT()) + return; + + struct pl_tex_gl *tex_gl = PL_PRIV(tex); + pl_assert(tex_gl->fbo || tex_gl->wrapped_fb); + + switch (tex->params.format->type) { + case PL_FMT_UNKNOWN: + case PL_FMT_FLOAT: + case PL_FMT_UNORM: + case PL_FMT_SNORM: + gl->ClearColor(color.f[0], color.f[1], color.f[2], color.f[3]); + break; + + case PL_FMT_UINT: + gl->ClearColorIuiEXT(color.u[0], color.u[1], color.u[2], color.u[3]); + break; + + case PL_FMT_SINT: + gl->ClearColorIiEXT(color.i[0], color.i[1], color.i[2], color.i[3]); + break; + + case PL_FMT_TYPE_COUNT: + pl_unreachable(); + } + + gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, tex_gl->fbo); + gl->Clear(GL_COLOR_BUFFER_BIT); + gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, 0); + gl_check_err(gpu, "gl_tex_clear"); + RELEASE_CURRENT(); +} + +void gl_tex_blit(pl_gpu gpu, const struct pl_tex_blit_params *params) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + if (!MAKE_CURRENT()) + return; + + struct pl_tex_gl *src_gl = PL_PRIV(params->src); + struct pl_tex_gl *dst_gl = PL_PRIV(params->dst); + + pl_assert(src_gl->fbo || src_gl->wrapped_fb); + pl_assert(dst_gl->fbo || dst_gl->wrapped_fb); + gl->BindFramebuffer(GL_READ_FRAMEBUFFER, src_gl->fbo); + gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, dst_gl->fbo); + + static const GLint filters[PL_TEX_SAMPLE_MODE_COUNT] = { + [PL_TEX_SAMPLE_NEAREST] = GL_NEAREST, + [PL_TEX_SAMPLE_LINEAR] = GL_LINEAR, + }; + + pl_rect3d src_rc = params->src_rc, dst_rc = params->dst_rc; + gl->BlitFramebuffer(src_rc.x0, src_rc.y0, src_rc.x1, src_rc.y1, + dst_rc.x0, dst_rc.y0, dst_rc.x1, dst_rc.y1, + GL_COLOR_BUFFER_BIT, filters[params->sample_mode]); + + gl->BindFramebuffer(GL_READ_FRAMEBUFFER, 0); + gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, 0); + gl_check_err(gpu, "gl_tex_blit"); + RELEASE_CURRENT(); +} + +static int get_alignment(size_t pitch) +{ + if (pitch % 8 == 0) + return 8; + if (pitch % 4 
== 0) + return 4; + if (pitch % 2 == 0) + return 2; + return 1; +} + +bool gl_tex_upload(pl_gpu gpu, const struct pl_tex_transfer_params *params) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + struct pl_gl *p = PL_PRIV(gpu); + pl_tex tex = params->tex; + pl_fmt fmt = tex->params.format; + pl_buf buf = params->buf; + struct pl_tex_gl *tex_gl = PL_PRIV(tex); + struct pl_buf_gl *buf_gl = buf ? PL_PRIV(buf) : NULL; + + // If the user requests asynchronous uploads, it's more efficient to do + // them via a PBO - this allows us to skip blocking the caller, especially + // when the host pointer can be imported directly. + if (params->callback && !buf) { + size_t buf_size = pl_tex_transfer_size(params); + const size_t min_size = 32*1024; // 32 KiB + if (buf_size >= min_size && buf_size <= gpu->limits.max_buf_size) + return pl_tex_upload_pbo(gpu, params); + } + + if (!MAKE_CURRENT()) + return false; + + uintptr_t src = (uintptr_t) params->ptr; + if (buf) { + gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, buf_gl->buffer); + src = buf_gl->offset + params->buf_offset; + } + + bool misaligned = params->row_pitch % fmt->texel_size; + int stride_w = params->row_pitch / fmt->texel_size; + int stride_h = params->depth_pitch / params->row_pitch; + + int dims = pl_tex_params_dimension(tex->params); + if (dims > 1) + gl->PixelStorei(GL_UNPACK_ALIGNMENT, get_alignment(params->row_pitch)); + + int rows = pl_rect_h(params->rc); + if (misaligned) { + rows = 1; + } else if (stride_w != pl_rect_w(params->rc)) { + gl->PixelStorei(GL_UNPACK_ROW_LENGTH, stride_w); + } + + int imgs = pl_rect_d(params->rc); + if (stride_h != pl_rect_h(params->rc) || rows < stride_h) + gl->PixelStorei(GL_UNPACK_IMAGE_HEIGHT, stride_h); + + gl->BindTexture(tex_gl->target, tex_gl->texture); + gl_timer_begin(gpu, params->timer); + + switch (dims) { + case 1: + gl->TexSubImage1D(tex_gl->target, 0, params->rc.x0, pl_rect_w(params->rc), + tex_gl->format, tex_gl->type, (void *) src); + break; + case 2: + for (int y = params->rc.y0; y < params->rc.y1; y += rows) { + gl->TexSubImage2D(tex_gl->target, 0, params->rc.x0, y, + pl_rect_w(params->rc), rows, tex_gl->format, + tex_gl->type, (void *) src); + src += params->row_pitch * rows; + } + break; + case 3: + for (int z = params->rc.z0; z < params->rc.z1; z += imgs) { + uintptr_t row_src = src; + for (int y = params->rc.y0; y < params->rc.y1; y += rows) { + gl->TexSubImage3D(tex_gl->target, 0, params->rc.x0, y, z, + pl_rect_w(params->rc), rows, imgs, + tex_gl->format, tex_gl->type, (void *) row_src); + row_src = (uintptr_t) row_src + params->row_pitch * rows; + } + src += params->depth_pitch * imgs; + } + break; + } + + gl_timer_end(gpu, params->timer); + gl->BindTexture(tex_gl->target, 0); + gl->PixelStorei(GL_UNPACK_ALIGNMENT, 4); + gl->PixelStorei(GL_UNPACK_ROW_LENGTH, 0); + gl->PixelStorei(GL_UNPACK_IMAGE_HEIGHT, 0); + + if (buf) { + gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); + if (buf->params.host_mapped) { + // Make sure the PBO is not reused until GL is done with it. If a + // previous operation is pending, "update" it by creating a new + // fence that will cover the previous operation as well. 
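+ // Note: glDeleteSync silently ignores a zero sync object, so this is safe + // even if no fence has been created yet.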
+ gl->DeleteSync(buf_gl->fence); + buf_gl->fence = gl->FenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); + } + } + + if (params->callback) { + PL_ARRAY_APPEND(gpu, p->callbacks, (struct gl_cb) { + .sync = gl->FenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0), + .callback = params->callback, + .priv = params->priv, + }); + } + + bool ok = gl_check_err(gpu, "gl_tex_upload"); + RELEASE_CURRENT(); + return ok; +} + +bool gl_tex_download(pl_gpu gpu, const struct pl_tex_transfer_params *params) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + struct pl_gl *p = PL_PRIV(gpu); + pl_tex tex = params->tex; + pl_fmt fmt = tex->params.format; + pl_buf buf = params->buf; + struct pl_tex_gl *tex_gl = PL_PRIV(tex); + struct pl_buf_gl *buf_gl = buf ? PL_PRIV(buf) : NULL; + bool ok = true; + + if (params->callback && !buf) { + size_t buf_size = pl_tex_transfer_size(params); + const size_t min_size = 32*1024; // 32 KiB + if (buf_size >= min_size && buf_size <= gpu->limits.max_buf_size) + return pl_tex_download_pbo(gpu, params); + } + + if (!MAKE_CURRENT()) + return false; + + uintptr_t dst = (uintptr_t) params->ptr; + if (buf) { + gl->BindBuffer(GL_PIXEL_PACK_BUFFER, buf_gl->buffer); + dst = buf_gl->offset + params->buf_offset; + } + + pl_rect3d full = { + 0, 0, 0, + tex->params.w, + PL_DEF(tex->params.h, 1), + PL_DEF(tex->params.d, 1), + }; + + bool misaligned = params->row_pitch % fmt->texel_size; + int stride_w = params->row_pitch / fmt->texel_size; + int stride_h = params->depth_pitch / params->row_pitch; + + int dims = pl_tex_params_dimension(tex->params); + bool is_copy = pl_rect3d_eq(params->rc, full) && + stride_w == tex->params.w && + stride_h == PL_DEF(tex->params.h, 1) && + !misaligned; + + gl_timer_begin(gpu, params->timer); + + if (tex_gl->fbo || tex_gl->wrapped_fb) { + // We can use a more efficient path when we have an FBO available + if (dims > 1) + gl->PixelStorei(GL_PACK_ALIGNMENT, get_alignment(params->row_pitch)); + + int rows = pl_rect_h(params->rc); + if (misaligned) { + rows = 1; + } else if (stride_w != tex->params.w) { + gl->PixelStorei(GL_PACK_ROW_LENGTH, stride_w); + } + + // No 3D framebuffers + pl_assert(pl_rect_d(params->rc) == 1); + + gl->BindFramebuffer(GL_READ_FRAMEBUFFER, tex_gl->fbo); + for (int y = params->rc.y0; y < params->rc.y1; y += rows) { + gl->ReadPixels(params->rc.x0, y, pl_rect_w(params->rc), rows, + tex_gl->format, tex_gl->type, (void *) dst); + dst += params->row_pitch * rows; + } + gl->BindFramebuffer(GL_READ_FRAMEBUFFER, 0); + gl->PixelStorei(GL_PACK_ALIGNMENT, 4); + gl->PixelStorei(GL_PACK_ROW_LENGTH, 0); + } else if (is_copy) { + // We're downloading the entire texture + gl->BindTexture(tex_gl->target, tex_gl->texture); + gl->GetTexImage(tex_gl->target, 0, tex_gl->format, tex_gl->type, (void *) dst); + gl->BindTexture(tex_gl->target, 0); + } else { + PL_ERR(gpu, "Partial downloads of 3D textures not implemented!"); + ok = false; + } + + gl_timer_end(gpu, params->timer); + + if (buf) { + gl->BindBuffer(GL_PIXEL_PACK_BUFFER, 0); + if (ok && buf->params.host_mapped) { + gl->DeleteSync(buf_gl->fence); + buf_gl->fence = gl->FenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); + } + } + + if (params->callback) { + PL_ARRAY_APPEND(gpu, p->callbacks, (struct gl_cb) { + .sync = gl->FenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0), + .callback = params->callback, + .priv = params->priv, + }); + } + + ok &= gl_check_err(gpu, "gl_tex_download"); + RELEASE_CURRENT(); + return ok; +} diff --git a/src/opengl/include/glad/meson.build b/src/opengl/include/glad/meson.build new file mode 100644 index 
0000000..05b3f02 --- /dev/null +++ b/src/opengl/include/glad/meson.build @@ -0,0 +1,29 @@ +glad_check = run_command([ python, '-c', 'import glad; print(glad.__version__)' ], + env: python_env, + capture: true, + check: false, +) + +glad_ver = glad_check.returncode() == 0 ? glad_check.stdout().strip() : 'none' +glad_req = '>= 2.0' + +if not glad_ver.version_compare(glad_req) + error(f'glad (required: @glad_req@, found: @glad_ver@) was not found in ' + + 'PYTHONPATH or `3rdparty`. Please run `git submodule update --init` ' + + 'followed by `meson --wipe`.') +endif + +glad = custom_target('gl.h', + output: 'gl.h', + env: python_env, + command: [ + python, '-m', 'glad', '--out-path=@OUTDIR@/../../', + '--reproducible', '--merge', '--api=gl:core,gles2,egl', + '--extensions=' + ','.join(gl_extensions), 'c', '--header-only', '--mx' + ] + (opengl_link.allowed() ? ['--loader'] : []) +) + +glad_dep = declare_dependency( + include_directories: include_directories('..'), + sources: glad, +) diff --git a/src/opengl/loader_egl.c b/src/opengl/loader_egl.c new file mode 100644 index 0000000..0e04c71 --- /dev/null +++ b/src/opengl/loader_egl.c @@ -0,0 +1,2 @@ +#define GLAD_EGL_IMPLEMENTATION +#include "common.h" diff --git a/src/opengl/loader_gl.c b/src/opengl/loader_gl.c new file mode 100644 index 0000000..26b8bef --- /dev/null +++ b/src/opengl/loader_gl.c @@ -0,0 +1,2 @@ +#define GLAD_GL_IMPLEMENTATION +#include "common.h" diff --git a/src/opengl/meson.build b/src/opengl/meson.build new file mode 100644 index 0000000..59ba921 --- /dev/null +++ b/src/opengl/meson.build @@ -0,0 +1,76 @@ +opengl_build = get_option('opengl') +opengl_link = get_option('gl-proc-addr') + +if host_machine.system() == 'windows' or host_machine.system().endswith('bsd') or \ + host_machine.system() == 'dragonfly' + libdl = declare_dependency() +else + libdl = cc.find_library('dl', required : opengl_link) +endif +opengl_link = opengl_link.require(libdl.found()) +components.set('opengl', opengl_build.allowed()) +components.set('gl-proc-addr', opengl_link.allowed()) + +if opengl_build.allowed() + sources += [ + 'opengl/context.c', + 'opengl/formats.c', + 'opengl/loader_gl.c', + 'opengl/loader_egl.c', + 'opengl/gpu.c', + 'opengl/gpu_tex.c', + 'opengl/gpu_pass.c', + 'opengl/swapchain.c', + 'opengl/utils.c', + ] + + if opengl_link.allowed() + build_deps += libdl + tests += 'opengl_surfaceless.c' + endif + + gl_extensions = [ + 'GL_AMD_pinned_memory', + 'GL_ARB_buffer_storage', + 'GL_ARB_compute_shader', + 'GL_ARB_framebuffer_object', + 'GL_ARB_get_program_binary', + 'GL_ARB_invalidate_subdata', + 'GL_ARB_pixel_buffer_object', + 'GL_ARB_program_interface_query', + 'GL_ARB_shader_image_load_store', + 'GL_ARB_shader_storage_buffer_object', + 'GL_ARB_sync', + 'GL_ARB_texture_float', + 'GL_ARB_texture_gather', + 'GL_ARB_texture_rg', + 'GL_ARB_timer_query', + 'GL_ARB_uniform_buffer_object', + 'GL_ARB_vertex_array_object', + 'GL_EXT_EGL_image_storage', + 'GL_EXT_color_buffer_float', + 'GL_EXT_color_buffer_half_float', + 'GL_EXT_texture3D', + 'GL_EXT_texture_format_BGRA8888', + 'GL_EXT_texture_integer', + 'GL_EXT_texture_norm16', + 'GL_EXT_texture_rg', + 'GL_EXT_unpack_subimage', + 'GL_KHR_debug', + 'GL_OES_EGL_image', + 'GL_OES_EGL_image_external', + 'EGL_EXT_image_dma_buf_import', + 'EGL_EXT_image_dma_buf_import_modifiers', + 'EGL_EXT_platform_base', + 'EGL_KHR_debug', + 'EGL_KHR_image_base', + 'EGL_MESA_image_dma_buf_export', + 'EGL_MESA_platform_surfaceless', + ] + + # Generate GL loader + subdir('include/glad') +else + glad_dep = [] + 
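# stubs.c keeps the public pl_opengl_* symbols defined even when OpenGL + # support is disabled; the stubs simply fail or are unreachable at runtime +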
sources += 'opengl/stubs.c' +endif diff --git a/src/opengl/stubs.c b/src/opengl/stubs.c new file mode 100644 index 0000000..20395f9 --- /dev/null +++ b/src/opengl/stubs.c @@ -0,0 +1,63 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "../common.h" +#include "log.h" + +#include <libplacebo/opengl.h> + +const struct pl_opengl_params pl_opengl_default_params = {0}; + +pl_opengl pl_opengl_create(pl_log log, const struct pl_opengl_params *params) +{ + pl_fatal(log, "libplacebo compiled without OpenGL support!"); + return NULL; +} + +void pl_opengl_destroy(pl_opengl *pgl) +{ + pl_opengl gl = *pgl; + pl_assert(!gl); +} + +pl_opengl pl_opengl_get(pl_gpu gpu) +{ + return NULL; +} + +pl_swapchain pl_opengl_create_swapchain(pl_opengl gl, + const struct pl_opengl_swapchain_params *params) +{ + pl_unreachable(); +} + +void pl_opengl_swapchain_update_fb(pl_swapchain sw, + const struct pl_opengl_framebuffer *fb) +{ + pl_unreachable(); +} + +pl_tex pl_opengl_wrap(pl_gpu gpu, const struct pl_opengl_wrap_params *params) +{ + pl_unreachable(); +} + +unsigned int pl_opengl_unwrap(pl_gpu gpu, pl_tex tex, unsigned int *out_target, + int *out_iformat, unsigned int *out_fbo) +{ + pl_unreachable(); +} diff --git a/src/opengl/swapchain.c b/src/opengl/swapchain.c new file mode 100644 index 0000000..46d5f9e --- /dev/null +++ b/src/opengl/swapchain.c @@ -0,0 +1,278 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "common.h" +#include "formats.h" +#include "gpu.h" +#include "swapchain.h" +#include "utils.h" +#include "pl_thread.h" + +struct priv { + struct pl_sw_fns impl; + + struct pl_opengl_swapchain_params params; + pl_opengl gl; + pl_mutex lock; + bool has_sync; + + // current parameters + pl_tex fb; + bool frame_started; + + // vsync fences + int swapchain_depth; + PL_ARRAY(GLsync) vsync_fences; +}; + +static const struct pl_sw_fns opengl_swapchain; + +pl_swapchain pl_opengl_create_swapchain(pl_opengl pl_gl, + const struct pl_opengl_swapchain_params *params) +{ + pl_gpu gpu = pl_gl->gpu; + + if (params->max_swapchain_depth < 0) { + PL_ERR(gpu, "Tried specifying negative swapchain depth?"); + return NULL; + } + + if (!gl_make_current(pl_gl)) + return NULL; + + struct pl_swapchain_t *sw = pl_zalloc_obj(NULL, sw, struct priv); + sw->log = gpu->log; + sw->gpu = gpu; + + struct priv *p = PL_PRIV(sw); + pl_mutex_init(&p->lock); + p->impl = opengl_swapchain; + p->params = *params; + p->has_sync = pl_opengl_has_ext(pl_gl, "GL_ARB_sync"); + p->gl = pl_gl; + + gl_release_current(pl_gl); + return sw; +} + +static void gl_sw_destroy(pl_swapchain sw) +{ + pl_gpu gpu = sw->gpu; + struct priv *p = PL_PRIV(sw); + + pl_gpu_flush(gpu); + pl_tex_destroy(gpu, &p->fb); + pl_mutex_destroy(&p->lock); + pl_free((void *) sw); +} + +static int gl_sw_latency(pl_swapchain sw) +{ + struct priv *p = PL_PRIV(sw); + return p->params.max_swapchain_depth; +} + +static bool gl_sw_resize(pl_swapchain sw, int *width, int *height) +{ + struct priv *p = PL_PRIV(sw); + const int w = *width, h = *height; + + pl_mutex_lock(&p->lock); + if (p->fb && w == p->fb->params.w && h == p->fb->params.h) { + pl_mutex_unlock(&p->lock); + return true; + } + + if (p->frame_started && (w || h)) { + PL_ERR(sw, "Tried resizing the swapchain while a frame was in progress! " + "Please submit the current frame first."); + pl_mutex_unlock(&p->lock); + return false; + } + + if (w && h) { + pl_tex_destroy(sw->gpu, &p->fb); + p->fb = pl_opengl_wrap(sw->gpu, pl_opengl_wrap_params( + .framebuffer = p->params.framebuffer.id, + .width = w, + .height = h, + )); + if (!p->fb) { + PL_ERR(sw, "Failed wrapping OpenGL framebuffer!"); + pl_mutex_unlock(&p->lock); + return false; + } + } + + if (!p->fb) { + PL_ERR(sw, "Tried calling `pl_swapchain_resize` with unknown size! " + "This is forbidden for OpenGL. The first call to " + "`pl_swapchain_resize` must include the width and height of the " + "swapchain, because there's no way to figure this out from " + "within the API."); + pl_mutex_unlock(&p->lock); + return false; + } + + *width = p->fb->params.w; + *height = p->fb->params.h; + pl_mutex_unlock(&p->lock); + return true; +} + +void pl_opengl_swapchain_update_fb(pl_swapchain sw, + const struct pl_opengl_framebuffer *fb) +{ + struct priv *p = PL_PRIV(sw); + pl_mutex_lock(&p->lock); + if (p->frame_started) { + PL_ERR(sw,"Tried calling `pl_opengl_swapchain_update_fb` while a frame " + "was in progress! Please submit the current frame first."); + pl_mutex_unlock(&p->lock); + return; + } + + if (p->params.framebuffer.id != fb->id) + pl_tex_destroy(sw->gpu, &p->fb); + + p->params.framebuffer = *fb; + pl_mutex_unlock(&p->lock); +} + +static bool gl_sw_start_frame(pl_swapchain sw, + struct pl_swapchain_frame *out_frame) +{ + struct priv *p = PL_PRIV(sw); + pl_mutex_lock(&p->lock); + bool ok = false; + + if (!p->fb) { + PL_ERR(sw, "Unknown framebuffer size. 
Please call `pl_swapchain_resize` " + "before `pl_swapchain_start_frame` for OpenGL swapchains!"); + goto error; + } + + if (p->frame_started) { + PL_ERR(sw, "Attempted calling `pl_swapchain_start` while a frame was " + "already in progress! Call `pl_swapchain_submit_frame` first."); + goto error; + } + + if (!gl_make_current(p->gl)) + goto error; + + *out_frame = (struct pl_swapchain_frame) { + .fbo = p->fb, + .flipped = !p->params.framebuffer.flipped, + .color_repr = { + .sys = PL_COLOR_SYSTEM_RGB, + .levels = PL_COLOR_LEVELS_FULL, + .alpha = p->fb->params.format->num_components == 4 + ? PL_ALPHA_PREMULTIPLIED + : PL_ALPHA_UNKNOWN, + .bits = { + // Just use the red channel in the absence of anything more + // sane to do, because the red channel is both guaranteed to + // exist and also typically has the minimum number of bits + // (which is arguably what matters for dithering) + .sample_depth = p->fb->params.format->component_depth[0], + .color_depth = p->fb->params.format->component_depth[0], + }, + }, + .color_space = pl_color_space_monitor, + }; + + p->frame_started = gl_check_err(sw->gpu, "gl_sw_start_frame"); + if (!p->frame_started) + goto error; + + // keep p->lock held + gl_release_current(p->gl); + return true; + +error: + gl_release_current(p->gl); + pl_mutex_unlock(&p->lock); + return ok; +} + +static bool gl_sw_submit_frame(pl_swapchain sw) +{ + struct priv *p = PL_PRIV(sw); + struct gl_ctx *glctx = PL_PRIV(p->gl); + const gl_funcs *gl = &glctx->func; + if (!gl_make_current(p->gl)) { + p->frame_started = false; + pl_mutex_unlock(&p->lock); + return false; + } + + pl_assert(p->frame_started); + if (p->has_sync && p->params.max_swapchain_depth) { + GLsync fence = gl->FenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); + if (fence) + PL_ARRAY_APPEND(sw, p->vsync_fences, fence); + } + + gl->Flush(); + p->frame_started = false; + bool ok = gl_check_err(sw->gpu, "gl_sw_submit_frame"); + gl_release_current(p->gl); + pl_mutex_unlock(&p->lock); + + return ok; +} + +static void gl_sw_swap_buffers(pl_swapchain sw) +{ + struct priv *p = PL_PRIV(sw); + struct gl_ctx *glctx = PL_PRIV(p->gl); + const gl_funcs *gl = &glctx->func; + if (!p->params.swap_buffers) { + PL_ERR(sw, "`pl_swapchain_swap_buffers` called but no " + "`params.swap_buffers` callback set!"); + return; + } + + pl_mutex_lock(&p->lock); + if (!gl_make_current(p->gl)) { + pl_mutex_unlock(&p->lock); + return; + } + + p->params.swap_buffers(p->params.priv); + + const int max_depth = p->params.max_swapchain_depth; + while (max_depth && p->vsync_fences.num >= max_depth) { + gl->ClientWaitSync(p->vsync_fences.elem[0], GL_SYNC_FLUSH_COMMANDS_BIT, 1e9); + gl->DeleteSync(p->vsync_fences.elem[0]); + PL_ARRAY_REMOVE_AT(p->vsync_fences, 0); + } + + gl_check_err(sw->gpu, "gl_sw_swap_buffers"); + gl_release_current(p->gl); + pl_mutex_unlock(&p->lock); +} + +static const struct pl_sw_fns opengl_swapchain = { + .destroy = gl_sw_destroy, + .latency = gl_sw_latency, + .resize = gl_sw_resize, + .start_frame = gl_sw_start_frame, + .submit_frame = gl_sw_submit_frame, + .swap_buffers = gl_sw_swap_buffers, +}; diff --git a/src/opengl/utils.c b/src/opengl/utils.c new file mode 100644 index 0000000..d96a3e7 --- /dev/null +++ b/src/opengl/utils.c @@ -0,0 +1,158 @@ +/* + * This file is part of libplacebo. 
+ * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "common.h" +#include "gpu.h" +#include "utils.h" + +const char *gl_err_str(GLenum err) +{ + switch (err) { +#define CASE(name) case name: return #name + CASE(GL_NO_ERROR); + CASE(GL_INVALID_ENUM); + CASE(GL_INVALID_VALUE); + CASE(GL_INVALID_OPERATION); + CASE(GL_INVALID_FRAMEBUFFER_OPERATION); + CASE(GL_OUT_OF_MEMORY); + CASE(GL_STACK_UNDERFLOW); + CASE(GL_STACK_OVERFLOW); +#undef CASE + + default: return "unknown error"; + } +} + +void gl_poll_callbacks(pl_gpu gpu) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + struct pl_gl *p = PL_PRIV(gpu); + while (p->callbacks.num) { + struct gl_cb cb = p->callbacks.elem[0]; + GLenum res = gl->ClientWaitSync(cb.sync, 0, 0); + switch (res) { + case GL_ALREADY_SIGNALED: + case GL_CONDITION_SATISFIED: + PL_ARRAY_REMOVE_AT(p->callbacks, 0); + cb.callback(cb.priv); + continue; + + case GL_WAIT_FAILED: + PL_ARRAY_REMOVE_AT(p->callbacks, 0); + gl->DeleteSync(cb.sync); + p->failed = true; + gl_check_err(gpu, "gl_poll_callbacks"); // NOTE: will recurse! + return; + + case GL_TIMEOUT_EXPIRED: + return; + + default: + pl_unreachable(); + } + } +} + +bool gl_check_err(pl_gpu gpu, const char *fun) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + struct pl_gl *p = PL_PRIV(gpu); + bool ret = true; + + while (true) { + GLenum error = gl->GetError(); + if (error == GL_NO_ERROR) + break; + PL_ERR(gpu, "%s: OpenGL error: %s", fun, gl_err_str(error)); + ret = false; + p->failed = true; + } + + gl_poll_callbacks(gpu); + return ret; +} + +bool gl_is_software(pl_opengl pl_gl) +{ + struct gl_ctx *glctx = PL_PRIV(pl_gl); + const gl_funcs *gl = &glctx->func; + const char *renderer = (char *) gl->GetString(GL_RENDERER); + return !renderer || + strcmp(renderer, "Software Rasterizer") == 0 || + strstr(renderer, "llvmpipe") || + strstr(renderer, "softpipe") || + strcmp(renderer, "Mesa X11") == 0 || + strcmp(renderer, "Apple Software Renderer") == 0; +} + +bool gl_is_gles(pl_opengl pl_gl) +{ + struct gl_ctx *glctx = PL_PRIV(pl_gl); + const gl_funcs *gl = &glctx->func; + const char *version = (char *) gl->GetString(GL_VERSION); + return pl_str_startswith0(pl_str0(version), "OpenGL ES"); +} + +bool gl_test_ext(pl_gpu gpu, const char *ext, int gl_ver, int gles_ver) +{ + struct pl_gl *p = PL_PRIV(gpu); + if (gl_ver && p->gl_ver >= gl_ver) + return true; + if (gles_ver && p->gles_ver >= gles_ver) + return true; + + return ext ? 
pl_opengl_has_ext(p->gl, ext) : false; +} + +const char *egl_err_str(EGLenum err) +{ + switch (err) { +#define CASE(name) case name: return #name + CASE(EGL_SUCCESS); + CASE(EGL_NOT_INITIALIZED); + CASE(EGL_BAD_ACCESS); + CASE(EGL_BAD_ALLOC); + CASE(EGL_BAD_ATTRIBUTE); + CASE(EGL_BAD_CONFIG); + CASE(EGL_BAD_CONTEXT); + CASE(EGL_BAD_CURRENT_SURFACE); + CASE(EGL_BAD_DISPLAY); + CASE(EGL_BAD_MATCH); + CASE(EGL_BAD_NATIVE_PIXMAP); + CASE(EGL_BAD_NATIVE_WINDOW); + CASE(EGL_BAD_PARAMETER); + CASE(EGL_BAD_SURFACE); +#undef CASE + + default: return "unknown error"; + } +} + +bool egl_check_err(pl_gpu gpu, const char *fun) +{ + struct pl_gl *p = PL_PRIV(gpu); + bool ret = true; + + while (true) { + GLenum error = eglGetError(); + if (error == EGL_SUCCESS) + return ret; + PL_ERR(gpu, "%s: EGL error: %s", fun, egl_err_str(error)); + ret = false; + p->failed = true; + } +} diff --git a/src/opengl/utils.h b/src/opengl/utils.h new file mode 100644 index 0000000..0be229d --- /dev/null +++ b/src/opengl/utils.h @@ -0,0 +1,57 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include "common.h" + +// Iterate through callbacks attached to the `pl_gl` and execute all of the +// ones that have completed. +// +// Thread-safety: Unsafe +void gl_poll_callbacks(pl_gpu gpu); + +// Return a human-readable name for various OpenGL errors +// +// Thread-safety: Safe +const char *gl_err_str(GLenum err); + +// Check for errors and log them + return false if detected +// +// Thread-safety: Unsafe +bool gl_check_err(pl_gpu gpu, const char *fun); + +// Returns true if the context is a suspected software rasterizer +// +// Thread-safety: Unsafe +bool gl_is_software(pl_opengl gl); + +// Returns true if the context is detected as OpenGL ES +// +// Thread-safety: Unsafe +bool gl_is_gles(pl_opengl gl); + +// Check for presence of an extension, alternatively a minimum GL version +// +// Thread-safety: Unsafe +bool gl_test_ext(pl_gpu gpu, const char *ext, int gl_ver, int gles_ver); + +// Thread-safety: Safe +const char *egl_err_str(EGLenum err); + +// Thread-safety: Unsafe +bool egl_check_err(pl_gpu gpu, const char *fun); diff --git a/src/options.c b/src/options.c new file mode 100644 index 0000000..1db53bf --- /dev/null +++ b/src/options.c @@ -0,0 +1,1166 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <math.h> + +#include "common.h" +#include "log.h" + +#include <libplacebo/options.h> + +struct priv { + pl_log log; + + // for pl_options_get + struct pl_opt_data_t data; + pl_str data_text; + + // for pl_options_save + pl_str saved; + + // internally managed hooks array + PL_ARRAY(const struct pl_hook *) hooks; +}; + +static const struct pl_options_t defaults = { + .params = { PL_RENDER_DEFAULTS }, + .deband_params = { PL_DEBAND_DEFAULTS }, + .sigmoid_params = { PL_SIGMOID_DEFAULTS }, + .color_adjustment = { PL_COLOR_ADJUSTMENT_NEUTRAL }, + .peak_detect_params = { PL_PEAK_DETECT_DEFAULTS }, + .color_map_params = { PL_COLOR_MAP_DEFAULTS }, + .dither_params = { PL_DITHER_DEFAULTS }, + .icc_params = { PL_ICC_DEFAULTS }, + .cone_params = { PL_CONE_NONE, 1.0 }, + .deinterlace_params = { PL_DEINTERLACE_DEFAULTS }, + .distort_params = { PL_DISTORT_DEFAULTS }, + .upscaler = { + .name = "custom", + .description = "Custom upscaler", + .allowed = PL_FILTER_UPSCALING, + }, + .downscaler = { + .name = "custom", + .description = "Custom downscaler", + .allowed = PL_FILTER_DOWNSCALING, + }, + .plane_upscaler = { + .name = "custom", + .description = "Custom plane upscaler", + .allowed = PL_FILTER_UPSCALING, + }, + .plane_downscaler = { + .name = "custom", + .description = "Custom plane downscaler", + .allowed = PL_FILTER_DOWNSCALING, + }, + .frame_mixer = { + .name = "custom", + .description = "Custom frame mixer", + .allowed = PL_FILTER_FRAME_MIXING, + }, +}; + +// Copies only whitelisted fields +static inline void copy_filter(struct pl_filter_config *dst, + const struct pl_filter_config *src) +{ + dst->kernel = src->kernel; + dst->window = src->window; + dst->radius = src->radius; + dst->clamp = src->clamp; + dst->blur = src->blur; + dst->taper = src->taper; + dst->polar = src->polar; + for (int i = 0; i < PL_FILTER_MAX_PARAMS; i++) { + dst->params[i] = src->params[i]; + dst->wparams[i] = src->wparams[i]; + } +} + +static inline void redirect_params(pl_options opts) +{ + // Copy all non-NULL params structs into pl_options and redirect them +#define REDIRECT_PARAMS(field) do \ +{ \ + if (opts->params.field) { \ + opts->field = *opts->params.field; \ + opts->params.field = &opts->field; \ + } \ +} while (0) + + REDIRECT_PARAMS(deband_params); + REDIRECT_PARAMS(sigmoid_params); + REDIRECT_PARAMS(color_adjustment); + REDIRECT_PARAMS(peak_detect_params); + REDIRECT_PARAMS(color_map_params); + REDIRECT_PARAMS(dither_params); + REDIRECT_PARAMS(icc_params); + REDIRECT_PARAMS(cone_params); + REDIRECT_PARAMS(deinterlace_params); + REDIRECT_PARAMS(distort_params); +} + +void pl_options_reset(pl_options opts, const struct pl_render_params *preset) +{ + *opts = defaults; + if (preset) + opts->params = *preset; + redirect_params(opts); + + // Make a copy of all scaler configurations that aren't built-in filters + struct { + bool upscaler; + bool downscaler; + bool plane_upscaler; + bool plane_downscaler; + bool frame_mixer; + } fixed = {0}; + + for (int i = 0; i < pl_num_filter_configs; i++) { + const struct pl_filter_config *f = pl_filter_configs[i]; + fixed.upscaler |= f == opts->params.upscaler; + fixed.downscaler |= f == opts->params.downscaler; + fixed.plane_upscaler |= f == opts->params.plane_upscaler; + fixed.plane_downscaler |= f == opts->params.plane_downscaler; + fixed.frame_mixer |= f == opts->params.frame_mixer; + } + +#define 
REDIRECT_SCALER(scaler) do \ +{ \ + if (opts->params.scaler && !fixed.scaler) { \ + copy_filter(&opts->scaler, opts->params.scaler); \ + opts->params.scaler = &opts->scaler; \ + } \ +} while (0) + + REDIRECT_SCALER(upscaler); + REDIRECT_SCALER(downscaler); + REDIRECT_SCALER(plane_upscaler); + REDIRECT_SCALER(plane_downscaler); + REDIRECT_SCALER(frame_mixer); +} + +pl_options pl_options_alloc(pl_log log) +{ + struct pl_options_t *opts = pl_zalloc_obj(NULL, opts, struct priv); + struct priv *p = PL_PRIV(opts); + pl_options_reset(opts, NULL); + p->log = log; + return opts; +} + +void pl_options_free(pl_options *popts) +{ + pl_free_ptr((void **) popts); +} + +static void make_hooks_internal(pl_options opts) +{ + struct priv *p = PL_PRIV(opts); + struct pl_render_params *params = &opts->params; + if (params->num_hooks && params->hooks != p->hooks.elem) { + PL_ARRAY_MEMDUP(opts, p->hooks, params->hooks, params->num_hooks); + params->hooks = p->hooks.elem; + } +} + +void pl_options_add_hook(pl_options opts, const struct pl_hook *hook) +{ + struct priv *p = PL_PRIV(opts); + make_hooks_internal(opts); + PL_ARRAY_APPEND(opts, p->hooks, hook); + opts->params.hooks = p->hooks.elem; +} + +void pl_options_insert_hook(pl_options opts, const struct pl_hook *hook, int idx) +{ + struct priv *p = PL_PRIV(opts); + make_hooks_internal(opts); + PL_ARRAY_INSERT_AT(opts, p->hooks, idx, hook); + opts->params.hooks = p->hooks.elem; +} + +void pl_options_remove_hook_at(pl_options opts, int idx) +{ + struct priv *p = PL_PRIV(opts); + make_hooks_internal(opts); + PL_ARRAY_REMOVE_AT(p->hooks, idx); + opts->params.hooks = p->hooks.elem; +} + +// Options printing/parsing context +typedef const struct opt_ctx_t { + pl_log log; // as a convenience, only needed when parsing + pl_opt opt; + void *alloc; // for printing only + pl_options opts; // current base ptr +} *opt_ctx; + +struct enum_val { + const char *name; + unsigned val; +}; + +struct preset { + const char *name; + const void *val; +}; + +struct named { + const char *name; +}; + +typedef const struct opt_priv_t { + int (*compare)(opt_ctx p, const void *a, const void *b); // optional + void (*print)(opt_ctx p, pl_str *out, const void *val); // apends to `out` + bool (*parse)(opt_ctx p, pl_str str, void *out_val); + const struct enum_val *values; // for enums, terminated by {0} + const struct preset *presets; // for preset lists, terminated by {0} + const struct named * const *names; // for array-backed options, terminated by NULL + + // Offset and size of option in `struct pl_options_t` + size_t offset; + size_t size; + size_t offset_params; // offset of actual struct (for params toggles) +} *opt_priv; + +static pl_opt_data get_opt_data(opt_ctx ctx) +{ + pl_options opts = ctx->opts; + struct priv *p = PL_PRIV(opts); + opt_priv priv = ctx->opt->priv; + const void *val = (void *) ((uintptr_t) opts + priv->offset); + + p->data_text.len = 0; + priv->print(ctx, &p->data_text, val); + p->data = (struct pl_opt_data_t) { + .opts = opts, + .opt = ctx->opt, + .value = val, + .text = (char *) p->data_text.buf, + }; + + return &p->data; +} + +pl_opt_data pl_options_get(pl_options opts, const char *key) +{ + struct priv *p = PL_PRIV(opts); + + pl_opt opt = pl_find_option(key); + if (!opt || opt->preset) { + PL_ERR(p, "Unrecognized or invalid option '%s'", key); + return NULL; + } + + return get_opt_data(&(struct opt_ctx_t) { + .alloc = opts, + .opts = opts, + .opt = opt, + }); +} + +void pl_options_iterate(pl_options opts, + void (*cb)(void *priv, pl_opt_data data), + void 
*cb_priv) +{ + for (pl_opt opt = pl_option_list; opt->key; opt++) { + if (opt->preset) + continue; + + struct opt_ctx_t ctx = { + .alloc = opts, + .opts = opts, + .opt = opt, + }; + + opt_priv priv = opt->priv; + const void *val = (void *) ((uintptr_t) opts + priv->offset); + const void *ref = (void *) ((uintptr_t) &defaults + priv->offset); + int cmp = priv->compare ? priv->compare(&ctx, val, ref) + : memcmp(val, ref, priv->size); + if (cmp != 0) + cb(cb_priv, get_opt_data(&ctx)); + } +} + +static void save_cb(void *priv, pl_opt_data data) +{ + pl_opt opt = data->opt; + void *alloc = data->opts; + pl_str *out = priv; + + if (out->len) + pl_str_append_raw(alloc, out, ",", 1); + pl_str_append_raw(alloc, out, opt->key, strlen(opt->key)); + pl_str_append_raw(alloc, out, "=", 1); + pl_str_append(alloc, out, pl_str0(data->text)); +} + +const char *pl_options_save(pl_options opts) +{ + struct priv *p = PL_PRIV(opts); + + p->saved.len = 0; + pl_options_iterate(opts, save_cb, &p->saved); + return p->saved.len ? (char *) p->saved.buf : ""; +} + +static bool option_set_raw(pl_options opts, pl_str k, pl_str v) +{ + struct priv *p = PL_PRIV(opts); + k = pl_str_strip(k); + v = pl_str_strip(v); + + pl_opt opt; + for (opt = pl_option_list; opt->key; opt++) { + if (pl_str_equals0(k, opt->key)) + goto found; + } + + PL_ERR(p, "Unrecognized option '%.*s', in '%.*s=%.*s'", + PL_STR_FMT(k), PL_STR_FMT(k), PL_STR_FMT(v)); + return false; + +found: + PL_TRACE(p, "Parsing option '%s' = '%.*s'", opt->key, PL_STR_FMT(v)); + if (opt->deprecated) + PL_WARN(p, "Option '%s' is deprecated", opt->key); + + struct opt_ctx_t ctx = { + .log = p->log, + .opts = opts, + .opt = opt, + }; + + opt_priv priv = opt->priv; + void *val = (void *) ((uintptr_t) opts + priv->offset); + return priv->parse(&ctx, v, val); +} + +bool pl_options_set_str(pl_options opts, const char *key, const char *value) +{ + return option_set_raw(opts, pl_str0(key), pl_str0(value)); +} + +bool pl_options_load(pl_options opts, const char *str) +{ + bool ret = true; + + pl_str rest = pl_str0(str); + while (rest.len) { + pl_str kv = pl_str_strip(pl_str_split_chars(rest, " ,;:\n", &rest)); + if (!kv.len) + continue; + pl_str v, k = pl_str_split_char(kv, '=', &v); + ret &= option_set_raw(opts, k, v); + } + + return ret; +} + +// Individual option types + +static void print_bool(opt_ctx p, pl_str *out, const void *ptr) +{ + const bool *val = ptr; + if (*val) { + pl_str_append(p->alloc, out, pl_str0("yes")); + } else { + pl_str_append(p->alloc, out, pl_str0("no")); + } +} + +static bool parse_bool(opt_ctx p, pl_str str, void *out) +{ + bool *res = out; + if (pl_str_equals0(str, "yes") || + pl_str_equals0(str, "y") || + pl_str_equals0(str, "on") || + pl_str_equals0(str, "true") || + pl_str_equals0(str, "enabled") || + !str.len) // accept naked option name as well + { + *res = true; + return true; + } else if (pl_str_equals0(str, "no") || + pl_str_equals0(str, "n") || + pl_str_equals0(str, "off") || + pl_str_equals0(str, "false") || + pl_str_equals0(str, "disabled")) + { + *res = false; + return true; + } + + PL_ERR(p, "Invalid value '%.*s' for option '%s', expected boolean", + PL_STR_FMT(str), p->opt->key); + return false; +} + +static void print_int(opt_ctx p, pl_str *out, const void *ptr) +{ + pl_opt opt = p->opt; + const int *val = ptr; + pl_assert(opt->min == opt->max || (*val >= opt->min && *val <= opt->max)); + pl_str_append_asprintf_c(p->alloc, out, "%d", *val); +} + +static bool parse_int(opt_ctx p, pl_str str, void *out) +{ + pl_opt opt = p->opt; + int 
val; + if (!pl_str_parse_int(str, &val)) { + PL_ERR(p, "Invalid value '%.*s' for option '%s', expected integer", + PL_STR_FMT(str), opt->key); + return false; + } + + if (opt->min != opt->max) { + if (val < opt->min || val > opt->max) { + PL_ERR(p, "Value of %d out of range for option '%s': [%d, %d]", + val, opt->key, (int) opt->min, (int) opt->max); + return false; + } + } + + *(int *) out = val; + return true; +} + +static void print_float(opt_ctx p, pl_str *out, const void *ptr) +{ + pl_opt opt = p->opt; + const float *val = ptr; + pl_assert(opt->min == opt->max || (*val >= opt->min && *val <= opt->max)); + pl_str_append_asprintf_c(p->alloc, out, "%f", *val); +} + +static bool parse_fraction(pl_str str, float *val) +{ + pl_str denom, num = pl_str_split_char(str, '/', &denom); + float n, d; + bool ok = denom.buf && denom.len && pl_str_parse_float(num, &n) && + pl_str_parse_float(denom, &d); + if (ok) + *val = n / d; + return ok; +} + +static bool parse_float(opt_ctx p, pl_str str, void *out) +{ + pl_opt opt = p->opt; + float val; + if (!parse_fraction(str, &val) && !pl_str_parse_float(str, &val)) { + PL_ERR(p, "Invalid value '%.*s' for option '%s', expected floating point " + "or fraction", PL_STR_FMT(str), opt->key); + return false; + } + + switch (fpclassify(val)) { + case FP_NAN: + case FP_INFINITE: + case FP_SUBNORMAL: + PL_ERR(p, "Invalid value '%f' for option '%s', non-normal float", + val, opt->key); + return false; + + case FP_ZERO: + case FP_NORMAL: + break; + } + + if (opt->min != opt->max) { + if (val < opt->min || val > opt->max) { + PL_ERR(p, "Value of %.3f out of range for option '%s': [%.2f, %.2f]", + val, opt->key, opt->min, opt->max); + return false; + } + } + + *(float *) out = val; + return true; +} + +static int compare_params(opt_ctx p, const void *pa, const void *pb) +{ + const bool a = *(const void * const *) pa; + const bool b = *(const void * const *) pb; + return PL_CMP(a, b); +} + +static void print_params(opt_ctx p, pl_str *out, const void *ptr) +{ + const bool value = *(const void * const *) ptr; + print_bool(p, out, &value); +} + +static bool parse_params(opt_ctx p, pl_str str, void *out) +{ + pl_opt opt = p->opt; + opt_priv priv = opt->priv; + const void **res = out; + bool set; + if (!parse_bool(p, str, &set)) + return false; + if (set) { + *res = (const void *) ((uintptr_t) p->opts + priv->offset_params); + } else { + *res = NULL; + } + return true; +} + +static void print_enum(opt_ctx p, pl_str *out, const void *ptr) +{ + pl_opt opt = p->opt; + opt_priv priv = opt->priv; + const unsigned value = *(const unsigned *) ptr; + for (int i = 0; priv->values[i].name; i++) { + if (priv->values[i].val == value) { + pl_str_append(p->alloc, out, pl_str0(priv->values[i].name)); + return; + } + } + + pl_unreachable(); +} + +static bool parse_enum(opt_ctx p, pl_str str, void *out) +{ + pl_opt opt = p->opt; + opt_priv priv = opt->priv; + for (int i = 0; priv->values[i].name; i++) { + if (pl_str_equals0(str, priv->values[i].name)) { + *(unsigned *) out = priv->values[i].val; + return true; + } + } + + PL_ERR(p, "Value of '%.*s' unrecognized for option '%s', valid values:", + PL_STR_FMT(str), opt->key); + for (int i = 0; priv->values[i].name; i++) + PL_ERR(p, " %s", priv->values[i].name); + return false; +} + +static bool parse_preset(opt_ctx p, pl_str str, void *out) +{ + pl_opt opt = p->opt; + opt_priv priv = opt->priv; + for (int i = 0; priv->presets[i].name; i++) { + if (pl_str_equals0(str, priv->presets[i].name)) { + if (priv->offset == offsetof(struct pl_options_t, 
params)) { + const struct pl_render_params *preset = priv->presets[i].val; + pl_assert(priv->size == sizeof(*preset)); + + // Redirect params structs into internal system after loading + struct pl_render_params *params = out, prev = *params; + *params = *preset; + redirect_params(p->opts); + + // Re-apply excluded options + params->lut = prev.lut; + params->hooks = prev.hooks; + params->num_hooks = prev.num_hooks; + params->info_callback = prev.info_callback; + params->info_priv = prev.info_priv; + } else { + memcpy(out, priv->presets[i].val, priv->size); + } + return true; + } + } + + PL_ERR(p, "Value of '%.*s' unrecognized for option '%s', valid values:", + PL_STR_FMT(str), opt->key); + for (int i = 0; priv->presets[i].name; i++) + PL_ERR(p, " %s", priv->presets[i].name); + return false; +} + +static void print_named(opt_ctx p, pl_str *out, const void *ptr) +{ + const struct named *value = *(const struct named **) ptr; + if (value) { + pl_str_append(p->alloc, out, pl_str0(value->name)); + } else { + pl_str_append(p->alloc, out, pl_str0("none")); + } +} + +static bool parse_named(opt_ctx p, pl_str str, void *out) +{ + pl_opt opt = p->opt; + opt_priv priv = opt->priv; + const struct named **res = out; + if (pl_str_equals0(str, "none")) { + *res = NULL; + return true; + } + + for (int i = 0; priv->names[i]; i++) { + if (pl_str_equals0(str, priv->names[i]->name)) { + *res = priv->names[i]; + return true; + } + } + + PL_ERR(p, "Value of '%.*s' unrecognized for option '%s', valid values:", + PL_STR_FMT(str), opt->key); + PL_ERR(p, " none"); + for (int i = 0; priv->names[i]; i++) + PL_ERR(p, " %s", priv->names[i]->name); + return false; +} + +static void print_scaler(opt_ctx p, pl_str *out, const void *ptr) +{ + const struct pl_filter_config *f = *(const struct pl_filter_config **) ptr; + if (f) { + pl_assert(f->name); // this is either a built-in scaler or ptr to custom + pl_str_append(p->alloc, out, pl_str0(f->name)); + } else { + pl_str_append(p->alloc, out, pl_str0("none")); + } +} + +static enum pl_filter_usage scaler_usage(pl_opt opt) +{ + opt_priv priv = opt->priv; + switch (priv->offset) { + case offsetof(struct pl_options_t, params.upscaler): + case offsetof(struct pl_options_t, params.plane_upscaler): + case offsetof(struct pl_options_t, upscaler): + case offsetof(struct pl_options_t, plane_upscaler): + return PL_FILTER_UPSCALING; + + case offsetof(struct pl_options_t, params.downscaler): + case offsetof(struct pl_options_t, params.plane_downscaler): + case offsetof(struct pl_options_t, downscaler): + case offsetof(struct pl_options_t, plane_downscaler): + return PL_FILTER_DOWNSCALING; + + case offsetof(struct pl_options_t, params.frame_mixer): + case offsetof(struct pl_options_t, frame_mixer): + return PL_FILTER_FRAME_MIXING; + } + + pl_unreachable(); +} + +static bool parse_scaler(opt_ctx p, pl_str str, void *out) +{ + pl_opt opt = p->opt; + opt_priv priv = opt->priv; + const struct pl_filter_config **res = out; + if (pl_str_equals0(str, "none")) { + *res = NULL; + return true; + } else if (pl_str_equals0(str, "custom")) { + *res = (void *) ((uintptr_t) p->opts + priv->offset_params); + return true; + } + + const enum pl_filter_usage usage = scaler_usage(opt); + for (int i = 0; i < pl_num_filter_configs; i++) { + if (!(pl_filter_configs[i]->allowed & usage)) + continue; + if (pl_str_equals0(str, pl_filter_configs[i]->name)) { + *res = pl_filter_configs[i]; + return true; + } + } + + PL_ERR(p, "Value of '%.*s' unrecognized for option '%s', valid values:", + PL_STR_FMT(str), 
opt->key); + PL_ERR(p, " none"); + PL_ERR(p, " custom"); + for (int i = 0; i < pl_num_filter_configs; i++) { + if (pl_filter_configs[i]->allowed & usage) + PL_ERR(p, " %s", pl_filter_configs[i]->name); + } + return false; +} + +static bool parse_scaler_preset(opt_ctx p, pl_str str, void *out) +{ + pl_opt opt = p->opt; + struct pl_filter_config *res = out; + if (pl_str_equals0(str, "none")) { + *res = (struct pl_filter_config) { .name = "custom" }; + return true; + } + + const enum pl_filter_usage usage = scaler_usage(opt); + for (int i = 0; i < pl_num_filter_configs; i++) { + if (!(pl_filter_configs[i]->allowed & usage)) + continue; + if (pl_str_equals0(str, pl_filter_configs[i]->name)) { + copy_filter(res, pl_filter_configs[i]); + return true; + } + } + + PL_ERR(p, "Value of '%.*s' unrecognized for option '%s', valid values:", + PL_STR_FMT(str), opt->key); + PL_ERR(p, " none"); + for (int i = 0; i < pl_num_filter_configs; i++) { + if (pl_filter_configs[i]->allowed & usage) + PL_ERR(p, " %s", pl_filter_configs[i]->name); + } + return false; +} + +#define OPT_BOOL(KEY, NAME, FIELD, ...) \ + { \ + .key = KEY, \ + .name = NAME, \ + .type = PL_OPT_BOOL, \ + .priv = &(struct opt_priv_t) { \ + .print = print_bool, \ + .parse = parse_bool, \ + .offset = offsetof(struct pl_options_t, FIELD), \ + .size = sizeof(struct { \ + bool dummy; \ + pl_static_assert(sizeof(defaults.FIELD) == sizeof(bool)); \ + }), \ + }, \ + __VA_ARGS__ \ + } + +#define OPT_INT(KEY, NAME, FIELD, ...) \ + { \ + .key = KEY, \ + .name = NAME, \ + .type = PL_OPT_INT, \ + .priv = &(struct opt_priv_t) { \ + .print = print_int, \ + .parse = parse_int, \ + .offset = offsetof(struct pl_options_t, FIELD), \ + .size = sizeof(struct { \ + int dummy; \ + pl_static_assert(sizeof(defaults.FIELD) == sizeof(int)); \ + }), \ + }, \ + __VA_ARGS__ \ + } + +#define OPT_FLOAT(KEY, NAME, FIELD, ...) \ + { \ + .key = KEY, \ + .name = NAME, \ + .type = PL_OPT_FLOAT, \ + .priv = &(struct opt_priv_t) { \ + .print = print_float, \ + .parse = parse_float, \ + .offset = offsetof(struct pl_options_t, FIELD), \ + .size = sizeof(struct { \ + float dummy; \ + pl_static_assert(sizeof(defaults.FIELD) == sizeof(float)); \ + }), \ + }, \ + __VA_ARGS__ \ + } + +#define OPT_ENABLE_PARAMS(KEY, NAME, PARAMS, ...) \ + { \ + .key = KEY, \ + .name = NAME, \ + .type = PL_OPT_BOOL, \ + .priv = &(struct opt_priv_t) { \ + .compare = compare_params, \ + .print = print_params, \ + .parse = parse_params, \ + .offset = offsetof(struct pl_options_t, params.PARAMS), \ + .offset_params = offsetof(struct pl_options_t, PARAMS), \ + .size = sizeof(struct { \ + void *dummy; \ + pl_static_assert(sizeof(defaults.params.PARAMS) == sizeof(void*));\ + }), \ + }, \ + __VA_ARGS__ \ + } + +#define OPT_ENUM(KEY, NAME, FIELD, VALUES, ...) \ + { \ + .key = KEY, \ + .name = NAME, \ + .type = PL_OPT_STRING, \ + .priv = &(struct opt_priv_t) { \ + .print = print_enum, \ + .parse = parse_enum, \ + .offset = offsetof(struct pl_options_t, FIELD), \ + .size = sizeof(struct { \ + unsigned dummy; \ + pl_static_assert(sizeof(defaults.FIELD) == sizeof(unsigned)); \ + }), \ + .values = (struct enum_val[]) { VALUES } \ + }, \ + __VA_ARGS__ \ + } + +#define OPT_PRESET(KEY, NAME, PARAMS, PRESETS, ...) 
\ + { \ + .key = KEY, \ + .name = NAME, \ + .type = PL_OPT_STRING, \ + .preset = true, \ + .priv = &(struct opt_priv_t) { \ + .parse = parse_preset, \ + .offset = offsetof(struct pl_options_t, PARAMS), \ + .size = sizeof(defaults.PARAMS), \ + .presets = (struct preset[]) { PRESETS }, \ + }, \ + __VA_ARGS__ \ + } + +#define OPT_NAMED(KEY, NAME, FIELD, NAMES, ...) \ + { \ + .key = KEY, \ + .name = NAME, \ + .type = PL_OPT_STRING, \ + .priv = &(struct opt_priv_t) { \ + .print = print_named, \ + .parse = parse_named, \ + .offset = offsetof(struct pl_options_t, FIELD), \ + .names = (const struct named * const * ) NAMES, \ + .size = sizeof(struct { \ + const struct named *dummy; \ + pl_static_assert(offsetof(__typeof__(*NAMES[0]), name) == 0); \ + pl_static_assert(sizeof(defaults.FIELD) == \ + sizeof(const struct named *)); \ + }), \ + }, \ + __VA_ARGS__ \ + } + +#define OPT_SCALER(KEY, NAME, SCALER, ...) \ + { \ + .key = KEY, \ + .name = NAME, \ + .type = PL_OPT_STRING, \ + .priv = &(struct opt_priv_t) { \ + .print = print_scaler, \ + .parse = parse_scaler, \ + .offset = offsetof(struct pl_options_t, params.SCALER), \ + .offset_params = offsetof(struct pl_options_t, SCALER), \ + .size = sizeof(struct { \ + const struct pl_filter_config *dummy; \ + pl_static_assert(sizeof(defaults.SCALER) == \ + sizeof(struct pl_filter_config)); \ + }), \ + }, \ + __VA_ARGS__ \ + } + +#define OPT_SCALER_PRESET(KEY, NAME, SCALER, ...) \ + { \ + .key = KEY, \ + .name = NAME, \ + .type = PL_OPT_STRING, \ + .preset = true, \ + .priv = &(struct opt_priv_t) { \ + .parse = parse_scaler_preset, \ + .offset = offsetof(struct pl_options_t, SCALER), \ + .size = sizeof(struct { \ + struct pl_filter_config dummy; \ + pl_static_assert(sizeof(defaults.SCALER) == \ + sizeof(struct pl_filter_config)); \ + }), \ + }, \ + __VA_ARGS__ \ + } + +#define LIST(...) 
__VA_ARGS__, {0} + +#define SCALE_OPTS(PREFIX, NAME, FIELD) \ + OPT_SCALER(PREFIX, NAME, FIELD), \ + OPT_SCALER_PRESET(PREFIX"_preset", NAME" preset", FIELD), \ + OPT_NAMED(PREFIX"_kernel", NAME" kernel", FIELD.kernel, pl_filter_functions), \ + OPT_NAMED(PREFIX"_window", NAME" window", FIELD.window, pl_filter_functions), \ + OPT_FLOAT(PREFIX"_radius", NAME" radius", FIELD.radius, .min = 0.0, .max = 16.0), \ + OPT_FLOAT(PREFIX"_clamp", NAME" clamping", FIELD.clamp, .max = 1.0), \ + OPT_FLOAT(PREFIX"_blur", NAME" blur factor", FIELD.blur, .max = 100.0), \ + OPT_FLOAT(PREFIX"_taper", NAME" taper factor", FIELD.taper, .max = 1.0), \ + OPT_FLOAT(PREFIX"_antiring", NAME" antiringing", FIELD.antiring, .max = 1.0), \ + OPT_FLOAT(PREFIX"_param1", NAME" parameter 1", FIELD.params[0]), \ + OPT_FLOAT(PREFIX"_param2", NAME" parameter 2", FIELD.params[1]), \ + OPT_FLOAT(PREFIX"_wparam1", NAME" window parameter 1", FIELD.wparams[0]), \ + OPT_FLOAT(PREFIX"_wparam2", NAME" window parameter 2", FIELD.wparams[1]), \ + OPT_BOOL(PREFIX"_polar", NAME" polar", FIELD.polar) + +const struct pl_opt_t pl_option_list[] = { + OPT_PRESET("preset", "Global preset", params, LIST( + {"default", &pl_render_default_params}, + {"fast", &pl_render_fast_params}, + {"high_quality", &pl_render_high_quality_params})), + + // Scalers + SCALE_OPTS("upscaler", "Upscaler", upscaler), + SCALE_OPTS("downscaler", "Downscaler", downscaler), + SCALE_OPTS("plane_upscaler", "Plane upscaler", plane_upscaler), + SCALE_OPTS("plane_downscaler", "Plane downscaler", plane_downscaler), + SCALE_OPTS("frame_mixer", "Frame mixer", frame_mixer), + OPT_FLOAT("antiringing_strength", "Anti-ringing strength", params.antiringing_strength, .max = 1.0), + + // Debanding + OPT_ENABLE_PARAMS("deband", "Enable debanding", deband_params), + OPT_PRESET("deband_preset", "Debanding preset", deband_params, LIST( + {"default", &pl_deband_default_params})), + OPT_INT("deband_iterations", "Debanding iterations", deband_params.iterations, .max = 16), + OPT_FLOAT("deband_threshold", "Debanding threshold", deband_params.threshold, .max = 1000.0), + OPT_FLOAT("deband_radius", "Debanding radius", deband_params.radius, .max = 1000.0), + OPT_FLOAT("deband_grain", "Debanding grain", deband_params.grain, .max = 1000.0), + OPT_FLOAT("deband_grain_neutral_r", "Debanding grain neutral R", deband_params.grain_neutral[0]), + OPT_FLOAT("deband_grain_neutral_g", "Debanding grain neutral G", deband_params.grain_neutral[1]), + OPT_FLOAT("deband_grain_neutral_b", "Debanding grain neutral B", deband_params.grain_neutral[2]), + + // Sigmoidization + OPT_ENABLE_PARAMS("sigmoid", "Enable sigmoidization", sigmoid_params), + OPT_PRESET("sigmoid_preset", "Sigmoidization preset", sigmoid_params, LIST( + {"default", &pl_sigmoid_default_params})), + OPT_FLOAT("sigmoid_center", "Sigmoidization center", sigmoid_params.center, .max = 1.0), + OPT_FLOAT("sigmoid_slope", "Sigmoidization slope", sigmoid_params.slope, .min = 1.0, .max = 20.0), + + // Color adjustment + OPT_ENABLE_PARAMS("color_adjustment", "Enable color adjustment", color_adjustment), + OPT_PRESET("color_adjustment_preset", "Color adjustment preset", color_adjustment, LIST( + {"neutral", &pl_color_adjustment_neutral})), + OPT_FLOAT("brightness", "Brightness boost", color_adjustment.brightness, .min = -1.0, .max = 1.0), + OPT_FLOAT("contrast", "Contrast boost", color_adjustment.contrast, .max = 100.0), + OPT_FLOAT("saturation", "Saturation gain", color_adjustment.saturation, .max = 100.0), + OPT_FLOAT("hue", "Hue shift",
color_adjustment.hue), + OPT_FLOAT("gamma", "Gamma adjustment", color_adjustment.gamma, .max = 100.0), + OPT_FLOAT("temperature", "Color temperature shift", color_adjustment.temperature, + .min = (2500 - 6500) / 3500.0, // see `pl_white_from_temp` + .max = (25000 - 6500) / 3500.0), + + // Peak detection + OPT_ENABLE_PARAMS("peak_detect", "Enable peak detection", peak_detect_params), + OPT_PRESET("peak_detect_preset", "Peak detection preset", peak_detect_params, LIST( + {"default", &pl_peak_detect_default_params}, + {"high_quality", &pl_peak_detect_high_quality_params})), + OPT_FLOAT("peak_smoothing_period", "Peak detection smoothing coefficient", peak_detect_params.smoothing_period, .max = 1000.0), + OPT_FLOAT("scene_threshold_low", "Scene change threshold low", peak_detect_params.scene_threshold_low, .max = 100.0), + OPT_FLOAT("scene_threshold_high", "Scene change threshold high", peak_detect_params.scene_threshold_high, .max = 100.0), + OPT_FLOAT("minimum_peak", "Minimum detected peak", peak_detect_params.minimum_peak, .max = 100.0, .deprecated = true), + OPT_FLOAT("peak_percentile", "Peak detection percentile", peak_detect_params.percentile, .max = 100.0), + OPT_BOOL("allow_delayed_peak", "Allow delayed peak detection", peak_detect_params.allow_delayed), + + // Color mapping + OPT_ENABLE_PARAMS("color_map", "Enable color mapping", color_map_params), + OPT_PRESET("color_map_preset", "Color mapping preset", color_map_params, LIST( + {"default", &pl_color_map_default_params}, + {"high_quality", &pl_color_map_high_quality_params})), + OPT_NAMED("gamut_mapping", "Gamut mapping function", color_map_params.gamut_mapping, + pl_gamut_map_functions), + OPT_FLOAT("perceptual_deadzone", "Gamut mapping perceptual deadzone", color_map_params.gamut_constants.perceptual_deadzone, .max = 1.0f), + OPT_FLOAT("perceptual_strength", "Gamut mapping perceptual strength", color_map_params.gamut_constants.perceptual_strength, .max = 1.0f), + OPT_FLOAT("colorimetric_gamma", "Gamut mapping colorimetric gamma", color_map_params.gamut_constants.colorimetric_gamma, .max = 10.0f), + OPT_FLOAT("softclip_knee", "Gamut mapping softclip knee point", color_map_params.gamut_constants.softclip_knee, .max = 1.0f), + OPT_FLOAT("softclip_desat", "Gamut mapping softclip desaturation strength", color_map_params.gamut_constants.softclip_desat, .max = 1.0f), + OPT_INT("lut3d_size_I", "Gamut 3DLUT size I", color_map_params.lut3d_size[0], .max = 1024), + OPT_INT("lut3d_size_C", "Gamut 3DLUT size C", color_map_params.lut3d_size[1], .max = 1024), + OPT_INT("lut3d_size_h", "Gamut 3DLUT size h", color_map_params.lut3d_size[2], .max = 1024), + OPT_BOOL("lut3d_tricubic", "Gamut 3DLUT tricubic interpolation", color_map_params.lut3d_tricubic), + OPT_BOOL("gamut_expansion", "Gamut expansion", color_map_params.gamut_expansion), + OPT_NAMED("tone_mapping", "Tone mapping function", color_map_params.tone_mapping_function, + pl_tone_map_functions), + OPT_FLOAT("knee_adaptation", "Tone mapping knee point adaptation", color_map_params.tone_constants.knee_adaptation, .max = 1.0f), + OPT_FLOAT("knee_minimum", "Tone mapping knee point minimum", color_map_params.tone_constants.knee_minimum, .max = 0.5f), + OPT_FLOAT("knee_maximum", "Tone mapping knee point maximum", color_map_params.tone_constants.knee_maximum, .min = 0.5f, .max = 1.0f), + OPT_FLOAT("knee_default", "Tone mapping knee point default", color_map_params.tone_constants.knee_default, .max = 1.0f), + OPT_FLOAT("knee_offset", "BT.2390 knee point offset", 
color_map_params.tone_constants.knee_offset, .min = 0.5f, .max = 2.0f), + OPT_FLOAT("slope_tuning", "Spline slope tuning strength", color_map_params.tone_constants.slope_tuning, .max = 10.0f), + OPT_FLOAT("slope_offset", "Spline slope tuning offset", color_map_params.tone_constants.slope_offset, .max = 1.0f), + OPT_FLOAT("spline_contrast", "Spline slope contrast", color_map_params.tone_constants.spline_contrast, .max = 1.5f), + OPT_FLOAT("reinhard_contrast", "Reinhard contrast", color_map_params.tone_constants.reinhard_contrast, .max = 1.0f), + OPT_FLOAT("linear_knee", "Tone mapping linear knee point", color_map_params.tone_constants.linear_knee, .max = 1.0f), + OPT_FLOAT("exposure", "Tone mapping linear exposure", color_map_params.tone_constants.exposure, .max = 10.0f), + OPT_BOOL("inverse_tone_mapping", "Inverse tone mapping", color_map_params.inverse_tone_mapping), + OPT_ENUM("tone_map_metadata", "Source of HDR metadata to use", color_map_params.metadata, LIST( + {"any", PL_HDR_METADATA_ANY}, + {"none", PL_HDR_METADATA_NONE}, + {"hdr10", PL_HDR_METADATA_HDR10}, + {"hdr10plus", PL_HDR_METADATA_HDR10PLUS}, + {"cie_y", PL_HDR_METADATA_CIE_Y})), + OPT_INT("tone_lut_size", "Tone mapping LUT size", color_map_params.lut_size, .max = 4096), + OPT_FLOAT("contrast_recovery", "HDR contrast recovery strength", color_map_params.contrast_recovery, .max = 2.0), + OPT_FLOAT("contrast_smoothness", "HDR contrast recovery smoothness", color_map_params.contrast_smoothness, .min = 1.0, .max = 32.0), + OPT_BOOL("force_tone_mapping_lut", "Force tone mapping LUT", color_map_params.force_tone_mapping_lut), + OPT_BOOL("visualize_lut", "Visualize tone mapping LUTs", color_map_params.visualize_lut), + OPT_FLOAT("visualize_lut_x0", "Visualization rect x0", color_map_params.visualize_rect.x0), + OPT_FLOAT("visualize_lut_y0", "Visualization rect y0", color_map_params.visualize_rect.y0), + OPT_FLOAT("visualize_lut_x1", "Visualization rect x1", color_map_params.visualize_rect.x1), + OPT_FLOAT("visualize_lut_y1", "Visualization rect y1", color_map_params.visualize_rect.y1), + OPT_FLOAT("visualize_hue", "Visualization hue slice", color_map_params.visualize_hue), + OPT_FLOAT("visualize_theta", "Visualization rotation", color_map_params.visualize_theta), + OPT_BOOL("show_clipping", "Highlight clipped pixels", color_map_params.show_clipping), + OPT_FLOAT("tone_mapping_param", "Tone mapping function parameter", color_map_params.tone_mapping_param, .deprecated = true), + + // Dithering + OPT_ENABLE_PARAMS("dither", "Enable dithering", dither_params), + OPT_PRESET("dither_preset", "Dithering preset", dither_params, LIST( + {"default", &pl_dither_default_params})), + OPT_ENUM("dither_method", "Dither method", dither_params.method, LIST( + {"blue", PL_DITHER_BLUE_NOISE}, + {"ordered_lut", PL_DITHER_ORDERED_LUT}, + {"ordered", PL_DITHER_ORDERED_FIXED}, + {"white", PL_DITHER_WHITE_NOISE})), + OPT_INT("dither_lut_size", "Dither LUT size", dither_params.lut_size, .min = 1, .max = 8), + OPT_BOOL("dither_temporal", "Temporal dithering", dither_params.temporal), + + // ICC + OPT_ENABLE_PARAMS("icc", "Enable ICC settings", icc_params, .deprecated = true), + OPT_PRESET("icc_preset", "ICC preset", icc_params, LIST( + {"default", &pl_icc_default_params}), .deprecated = true), + OPT_ENUM("icc_intent", "ICC rendering intent", icc_params.intent, LIST( + {"auto", PL_INTENT_AUTO}, + {"perceptual", PL_INTENT_PERCEPTUAL}, + {"relative", PL_INTENT_RELATIVE_COLORIMETRIC}, + {"saturation", PL_INTENT_SATURATION}, + {"absolute",
PL_INTENT_ABSOLUTE_COLORIMETRIC}), .deprecated = true), + OPT_INT("icc_size_r", "ICC 3DLUT size R", icc_params.size_r, .max = 256, .deprecated = true), + OPT_INT("icc_size_g", "ICC 3DLUT size G", icc_params.size_g, .max = 256, .deprecated = true), + OPT_INT("icc_size_b", "ICC 3DLUT size B", icc_params.size_b, .max = 256, .deprecated = true), + OPT_FLOAT("icc_max_luma", "ICC profile luma override", icc_params.max_luma, .max = 10000, .deprecated = true), + OPT_BOOL("icc_force_bpc", "Force ICC black point compensation", icc_params.force_bpc, .deprecated = true), + + // Cone distortion + OPT_ENABLE_PARAMS("cone", "Enable cone distortion", cone_params), + OPT_PRESET("cone_preset", "Cone distortion preset", cone_params, LIST( + {"normal", &pl_vision_normal}, + {"protanomaly", &pl_vision_protanomaly}, + {"protanopia", &pl_vision_protanopia}, + {"deuteranomaly", &pl_vision_deuteranomaly}, + {"deuteranopia", &pl_vision_deuteranopia}, + {"tritanomaly", &pl_vision_tritanomaly}, + {"tritanopia", &pl_vision_tritanopia}, + {"monochromacy", &pl_vision_monochromacy}, + {"achromatopsia", &pl_vision_achromatopsia})), + OPT_ENUM("cones", "Cone selection", cone_params.cones, LIST( + {"none", PL_CONE_NONE}, + {"l", PL_CONE_L}, + {"m", PL_CONE_M}, + {"s", PL_CONE_S}, + {"lm", PL_CONE_LM}, + {"ms", PL_CONE_MS}, + {"ls", PL_CONE_LS}, + {"lms", PL_CONE_LMS})), + OPT_FLOAT("cone_strength", "Cone distortion gain", cone_params.strength), + + // Blending +#define BLEND_VALUES LIST( \ + {"zero", PL_BLEND_ZERO}, \ + {"one", PL_BLEND_ONE}, \ + {"alpha", PL_BLEND_SRC_ALPHA}, \ + {"one_minus_alpha", PL_BLEND_ONE_MINUS_SRC_ALPHA}) + + OPT_ENABLE_PARAMS("blend", "Enable output blending", blend_params), + OPT_PRESET("blend_preset", "Output blending preset", blend_params, LIST( + {"alpha_overlay", &pl_alpha_overlay})), + OPT_ENUM("blend_src_rgb", "Source RGB blend mode", blend_params.src_rgb, BLEND_VALUES), + OPT_ENUM("blend_src_alpha", "Source alpha blend mode", blend_params.src_alpha, BLEND_VALUES), + OPT_ENUM("blend_dst_rgb", "Target RGB blend mode", blend_params.dst_rgb, BLEND_VALUES), + OPT_ENUM("blend_dst_alpha", "Target alpha blend mode", blend_params.dst_alpha, BLEND_VALUES), + + // Deinterlacing + OPT_ENABLE_PARAMS("deinterlace", "Enable deinterlacing", deinterlace_params), + OPT_PRESET("deinterlace_preset", "Deinterlacing preset", deinterlace_params, LIST( + {"default", &pl_deinterlace_default_params})), + OPT_ENUM("deinterlace_algo", "Deinterlacing algorithm", deinterlace_params.algo, LIST( + {"weave", PL_DEINTERLACE_WEAVE}, + {"bob", PL_DEINTERLACE_BOB}, + {"yadif", PL_DEINTERLACE_YADIF})), + OPT_BOOL("deinterlace_skip_spatial", "Skip spatial interlacing check", deinterlace_params.skip_spatial_check), + + // Distortion + OPT_ENABLE_PARAMS("distort", "Enable distortion", distort_params), + OPT_PRESET("distort_preset", "Distortion preset", distort_params, LIST( + {"default", &pl_distort_default_params})), + OPT_FLOAT("distort_scale_x", "Distortion X scale", distort_params.transform.mat.m[0][0]), + OPT_FLOAT("distort_scale_y", "Distortion Y scale", distort_params.transform.mat.m[1][1]), + OPT_FLOAT("distort_shear_x", "Distortion X shear", distort_params.transform.mat.m[0][1]), + OPT_FLOAT("distort_shear_y", "Distortion Y shear", distort_params.transform.mat.m[1][0]), + OPT_FLOAT("distort_offset_x", "Distortion X offset", distort_params.transform.c[0]), + OPT_FLOAT("distort_offset_y", "Distortion Y offset", distort_params.transform.c[1]), + OPT_BOOL("distort_unscaled", "Distortion unscaled", distort_params.unscaled), +
OPT_BOOL("distort_constrain", "Constrain distortion", distort_params.constrain), + OPT_BOOL("distort_bicubic", "Distortion bicubic interpolation", distort_params.bicubic), + OPT_ENUM("distort_address_mode", "Distortion texture address mode", distort_params.address_mode, LIST( + {"clamp", PL_TEX_ADDRESS_CLAMP}, + {"repeat", PL_TEX_ADDRESS_REPEAT}, + {"mirror", PL_TEX_ADDRESS_MIRROR})), + OPT_ENUM("distort_alpha_mode", "Distortion alpha blending mode", distort_params.alpha_mode, LIST( + {"none", PL_ALPHA_UNKNOWN}, + {"independent", PL_ALPHA_INDEPENDENT}, + {"premultiplied", PL_ALPHA_PREMULTIPLIED})), + + // Misc renderer settings + OPT_NAMED("error_diffusion", "Error diffusion kernel", params.error_diffusion, + pl_error_diffusion_kernels), + OPT_ENUM("lut_type", "Color mapping LUT type", params.lut_type, LIST( + {"unknown", PL_LUT_UNKNOWN}, + {"native", PL_LUT_NATIVE}, + {"normalized", PL_LUT_NORMALIZED}, + {"conversion", PL_LUT_CONVERSION})), + OPT_FLOAT("background_r", "Background color R", params.background_color[0], .max = 1.0), + OPT_FLOAT("background_g", "Background color G", params.background_color[1], .max = 1.0), + OPT_FLOAT("background_b", "Background color B", params.background_color[2], .max = 1.0), + OPT_FLOAT("background_transparency", "Background color transparency", params.background_transparency, .max = 1), + OPT_BOOL("skip_target_clearing", "Skip target clearing", params.skip_target_clearing), + OPT_FLOAT("corner_rounding", "Corner rounding", params.corner_rounding, .max = 1.0), + OPT_BOOL("blend_against_tiles", "Blend against tiles", params.blend_against_tiles), + OPT_FLOAT("tile_color_hi_r", "Bright tile R", params.tile_colors[0][0], .max = 1.0), + OPT_FLOAT("tile_color_hi_g", "Bright tile G", params.tile_colors[0][1], .max = 1.0), + OPT_FLOAT("tile_color_hi_b", "Bright tile B", params.tile_colors[0][2], .max = 1.0), + OPT_FLOAT("tile_color_lo_r", "Dark tile R", params.tile_colors[1][0], .max = 1.0), + OPT_FLOAT("tile_color_lo_g", "Dark tile G", params.tile_colors[1][1], .max = 1.0), + OPT_FLOAT("tile_color_lo_b", "Dark tile B", params.tile_colors[1][2], .max = 1.0), + OPT_INT("tile_size", "Tile size", params.tile_size, .min = 2, .max = 256), + + // Performance / quality trade-offs and debugging options + OPT_BOOL("skip_anti_aliasing", "Skip anti-aliasing", params.skip_anti_aliasing), + OPT_INT("lut_entries", "Scaler LUT entries", params.lut_entries, .max = 256, .deprecated = true), + OPT_FLOAT("polar_cutoff", "Polar LUT cutoff", params.polar_cutoff, .max = 1.0, .deprecated = true), + OPT_BOOL("preserve_mixing_cache", "Preserve mixing cache", params.preserve_mixing_cache), + OPT_BOOL("skip_caching_single_frame", "Skip caching single frame", params.skip_caching_single_frame), + OPT_BOOL("disable_linear_scaling", "Disable linear scaling", params.disable_linear_scaling), + OPT_BOOL("disable_builtin_scalers", "Disable built-in scalers", params.disable_builtin_scalers), + OPT_BOOL("correct_subpixel_offset", "Correct subpixel offsets", params.correct_subpixel_offsets), + OPT_BOOL("ignore_icc_profiles", "Ignore ICC profiles", params.ignore_icc_profiles, .deprecated = true), + OPT_BOOL("force_dither", "Force-enable dithering", params.force_dither), + OPT_BOOL("disable_dither_gamma_correction", "Disable gamma-correct dithering", params.disable_dither_gamma_correction), + OPT_BOOL("disable_fbos", "Disable FBOs", params.disable_fbos), + OPT_BOOL("force_low_bit_depth_fbos", "Force 8-bit FBOs", params.force_low_bit_depth_fbos), + OPT_BOOL("dynamic_constants", "Dynamic constants", 
params.dynamic_constants), + {0}, +}; + +const int pl_option_count = PL_ARRAY_SIZE(pl_option_list) - 1; + +pl_opt pl_find_option(const char *key) +{ + for (int i = 0; i < pl_option_count; i++) { + if (!strcmp(key, pl_option_list[i].key)) + return &pl_option_list[i]; + } + + return NULL; +} diff --git a/src/os.h b/src/os.h new file mode 100644 index 0000000..386f0cb --- /dev/null +++ b/src/os.h @@ -0,0 +1,30 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#ifdef __unix__ +#define PL_HAVE_UNIX +#endif + +#ifdef _WIN32 +#define PL_HAVE_WIN32 +#endif + +#ifdef __APPLE__ +#define PL_HAVE_APPLE +#endif diff --git a/src/pl_alloc.c b/src/pl_alloc.c new file mode 100644 index 0000000..64eeda7 --- /dev/null +++ b/src/pl_alloc.c @@ -0,0 +1,313 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "common.h" + +struct header { +#ifndef NDEBUG +#define MAGIC 0x20210119LU + uint32_t magic; +#endif + size_t size; + struct header *parent; + struct ext *ext; + + // Pointer to actual data, for alignment purposes + max_align_t data[]; +}; + +// Lazily allocated, to save space for leaf allocations and allocations which +// don't need fancy requirements +struct ext { + size_t num_children; + size_t children_size; // total allocated size of `children` + struct header *children[]; +}; + +#define PTR_OFFSET offsetof(struct header, data) +#define MAX_ALLOC (SIZE_MAX - PTR_OFFSET) +#define MINIMUM_CHILDREN 4 + +static inline struct header *get_header(void *ptr) +{ + if (!ptr) + return NULL; + + struct header *hdr = (struct header *) ((uintptr_t) ptr - PTR_OFFSET); +#ifndef NDEBUG + assert(hdr->magic == MAGIC); +#endif + + return hdr; +} + +static inline void *oom(void) +{ + fprintf(stderr, "out of memory\n"); + abort(); +} + +static inline struct ext *alloc_ext(struct header *h) +{ + if (!h) + return NULL; + + if (!h->ext) { + h->ext = malloc(sizeof(struct ext) + MINIMUM_CHILDREN * sizeof(void *)); + if (!h->ext) + oom(); + h->ext->num_children = 0; + h->ext->children_size = MINIMUM_CHILDREN; + } + + return h->ext; +} + +static inline void attach_child(struct header *parent, struct header *child) +{ + child->parent = parent; + if (!parent) + return; + + + struct ext *ext = alloc_ext(parent); + if (ext->num_children == ext->children_size) { + size_t new_size = ext->children_size * 2; + ext = realloc(ext, sizeof(struct ext) + new_size * sizeof(void *)); + if (!ext) + oom(); + ext->children_size = new_size; + parent->ext = ext; + } + + ext->children[ext->num_children++] = child; +} + +static inline void unlink_child(struct header *parent, struct header *child) +{ + child->parent = NULL; + if (!parent) + return; + + struct ext *ext = parent->ext; + for (size_t i = 0; i < ext->num_children; i++) { + if (ext->children[i] == child) { + memmove(&ext->children[i], &ext->children[i + 1], + (--ext->num_children - i) * sizeof(ext->children[0])); + return; + } + } + + assert(!"unlinking orphaned child?"); +} + +void *pl_alloc(void *parent, size_t size) +{ + if (size >= MAX_ALLOC) + return oom(); + + struct header *h = malloc(PTR_OFFSET + size); + if (!h) + return oom(); + +#ifndef NDEBUG + h->magic = MAGIC; +#endif + h->size = size; + h->ext = NULL; + + attach_child(get_header(parent), h); + return h->data; +} + +void *pl_zalloc(void *parent, size_t size) +{ + if (size >= MAX_ALLOC) + return oom(); + + struct header *h = calloc(1, PTR_OFFSET + size); + if (!h) + return oom(); + +#ifndef NDEBUG + h->magic = MAGIC; +#endif + h->size = size; + + attach_child(get_header(parent), h); + return h->data; +} + +void *pl_realloc(void *parent, void *ptr, size_t size) +{ + if (size >= MAX_ALLOC) + return oom(); + if (!ptr) + return pl_alloc(parent, size); + + struct header *h = get_header(ptr); + assert(get_header(parent) == h->parent); + if (h->size == size) + return ptr; + + struct header *old_h = h; + h = realloc(h, PTR_OFFSET + size); + if (!h) + return oom(); + + h->size = size; + + if (h != old_h) { + if (h->parent) { + struct ext *ext = h->parent->ext; + for (size_t i = 0; i < ext->num_children; i++) { + if (ext->children[i] == old_h) { + ext->children[i] = h; + goto done_reparenting; + } + } + assert(!"reallocating orphaned child?"); + } +done_reparenting: + + if (h->ext) { + for (size_t i = 0; i < h->ext->num_children; i++) + h->ext->children[i]->parent = h; + } + } + + return h->data; +} + 
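A minimal usage sketch of the parent/child allocation hierarchy implemented above (the function name, sizes and values are purely illustrative, and the pl_alloc.h helpers are assumed to be in scope via "common.h"):

static void example_lifetimes(void)
{
    void *ctx = pl_tmp(NULL);                           // empty root allocation
    int *counters = pl_zalloc(ctx, 16 * sizeof(int));   // child of `ctx`
    char *name = pl_memdup(ctx, "frame", 6);            // another child of `ctx`

    void *keep = pl_tmp(NULL);
    pl_steal(keep, name);   // reparent `name` (and any children it has) onto `keep`

    pl_free(ctx);           // frees `ctx` and `counters`; `name` survives under `keep`
    pl_free(keep);          // frees `keep` and, with it, `name`
}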
+void pl_free(void *ptr) +{ + struct header *h = get_header(ptr); + if (!h) + return; + + pl_free_children(ptr); + unlink_child(h->parent, h); + + free(h->ext); + free(h); +} + +void pl_free_children(void *ptr) +{ + struct header *h = get_header(ptr); + if (!h || !h->ext) + return; + +#ifndef NDEBUG + // this detects recursive hierarchies + h->magic = 0; +#endif + + for (size_t i = 0; i < h->ext->num_children; i++) { + h->ext->children[i]->parent = NULL; // prevent recursive access + pl_free(h->ext->children[i]->data); + } + h->ext->num_children = 0; + +#ifndef NDEBUG + h->magic = MAGIC; +#endif +} + +size_t pl_get_size(const void *ptr) +{ + const struct header *h = get_header((void *) ptr); + return h ? h->size : 0; +} + +void *pl_steal(void *parent, void *ptr) +{ + struct header *h = get_header(ptr); + if (!h) + return NULL; + + struct header *new_par = get_header(parent); + if (new_par != h->parent) { + unlink_child(h->parent, h); + attach_child(new_par, h); + } + + return h->data; +} + +void *pl_memdup(void *parent, const void *ptr, size_t size) +{ + if (!size) + return NULL; + + void *new = pl_alloc(parent, size); + if (!new) + return oom(); + + assert(ptr); + memcpy(new, ptr, size); + return new; +} + +char *pl_str0dup0(void *parent, const char *str) +{ + if (!str) + return NULL; + + return pl_memdup(parent, str, strlen(str) + 1); +} + +char *pl_strndup0(void *parent, const char *str, size_t size) +{ + if (!str) + return NULL; + + size_t str_size = strnlen(str, size); + char *new = pl_alloc(parent, str_size + 1); + if (!new) + return oom(); + memcpy(new, str, str_size); + new[str_size] = '\0'; + return new; +} + +char *pl_asprintf(void *parent, const char *fmt, ...) +{ + char *str; + va_list ap; + va_start(ap, fmt); + str = pl_vasprintf(parent, fmt, ap); + va_end(ap); + return str; +} + +char *pl_vasprintf(void *parent, const char *fmt, va_list ap) +{ + // First, we need to determine the size that will be required for + // printing the entire string. Do this by making a copy of the va_list + // and printing it to a null buffer. + va_list copy; + va_copy(copy, ap); + int size = vsnprintf(NULL, 0, fmt, copy); + va_end(copy); + if (size < 0) + return NULL; + + char *str = pl_alloc(parent, size + 1); + vsnprintf(str, size + 1, fmt, ap); + return str; +} diff --git a/src/pl_alloc.h b/src/pl_alloc.h new file mode 100644 index 0000000..78df08a --- /dev/null +++ b/src/pl_alloc.h @@ -0,0 +1,191 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include <stdalign.h> +#include <stdarg.h> +#include <stddef.h> +#include <stdint.h> +#include <string.h> + +// Unlike standard malloc, `size` may be 0, in which case this returns an empty +// allocation which can still be used as a parent for other allocations. 
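A minimal usage sketch of the PL_ARRAY growable-array helpers declared further down in this header (the function name and element values are purely illustrative):

static inline void example_array_usage(void)
{
    void *tmp = pl_tmp(NULL);
    PL_ARRAY(float) samples = {0};
    PL_ARRAY_APPEND(tmp, samples, 1.0f);
    PL_ARRAY_APPEND(tmp, samples, 2.5f);    // backing storage grows as needed

    float last;
    if (PL_ARRAY_POP(samples, &last))       // last == 2.5f, samples.num == 1
        PL_ARRAY_REMOVE_AT(samples, 0);     // drop the remaining element

    pl_free(tmp);                           // also releases the array's storage
}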
+void *pl_alloc(void *parent, size_t size); +void *pl_zalloc(void *parent, size_t size); +void *pl_realloc(void *parent, void *ptr, size_t size); + +static inline void *pl_calloc(void *parent, size_t count, size_t size) +{ + return pl_zalloc(parent, count * size); +} + +#define pl_tmp(parent) pl_alloc(parent, 0) + +// Variants of the above which resolve to sizeof(*ptr) +#define pl_alloc_ptr(parent, ptr) \ + (__typeof__(ptr)) pl_alloc(parent, sizeof(*(ptr))) +#define pl_zalloc_ptr(parent, ptr) \ + (__typeof__(ptr)) pl_zalloc(parent, sizeof(*(ptr))) +#define pl_calloc_ptr(parent, num, ptr) \ + (__typeof__(ptr)) pl_calloc(parent, num, sizeof(*(ptr))) + +// Helper function to allocate a struct and immediately assign it +#define pl_alloc_struct(parent, type, ...) \ + (type *) pl_memdup(parent, &(type) __VA_ARGS__, sizeof(type)) + +// Free an allocation and its children (recursively) +void pl_free(void *ptr); +void pl_free_children(void *ptr); + +#define pl_free_ptr(ptr) \ + do { \ + pl_free(*(ptr)); \ + *(ptr) = NULL; \ + } while (0) + +// Get the current size of an allocation. +size_t pl_get_size(const void *ptr); + +#define pl_grow(parent, ptr, size) \ + do { \ + size_t _size = (size); \ + if (_size > pl_get_size(*(ptr))) \ + *(ptr) = pl_realloc(parent, *(ptr), _size); \ + } while (0) + +// Reparent an allocation onto a new parent +void *pl_steal(void *parent, void *ptr); + +// Wrapper functions around common string utilities +void *pl_memdup(void *parent, const void *ptr, size_t size); +char *pl_str0dup0(void *parent, const char *str); +char *pl_strndup0(void *parent, const char *str, size_t size); + +#define pl_memdup_ptr(parent, ptr) \ + (__typeof__(ptr)) pl_memdup(parent, ptr, sizeof(*(ptr))) + +// Helper functions for allocating public/private pairs, done by allocating +// `priv` at the address of `pub` + sizeof(pub), rounded up to the maximum +// alignment requirements. + +#define PL_ALIGN_MEM(size) PL_ALIGN2(size, alignof(max_align_t)) + +#define PL_PRIV(pub) \ + (void *) ((uintptr_t) (pub) + PL_ALIGN_MEM(sizeof(*(pub)))) + +#define pl_alloc_obj(parent, ptr, priv) \ + (__typeof__(ptr)) pl_alloc(parent, PL_ALIGN_MEM(sizeof(*(ptr))) + sizeof(priv)) + +#define pl_zalloc_obj(parent, ptr, priv) \ + (__typeof__(ptr)) pl_zalloc(parent, PL_ALIGN_MEM(sizeof(*(ptr))) + sizeof(priv)) + +// Helper functions for dealing with arrays + +#define PL_ARRAY(type) struct { type *elem; int num; } + +#define PL_ARRAY_REALLOC(parent, arr, len) \ + do { \ + size_t _new_size = (len) * sizeof((arr).elem[0]); \ + (arr).elem = pl_realloc((void *) parent, (arr).elem, _new_size); \ + } while (0) + +#define PL_ARRAY_RESIZE(parent, arr, len) \ + do { \ + size_t _avail = pl_get_size((arr).elem) / sizeof((arr).elem[0]); \ + size_t _min_len = (len); \ + if (_avail < _min_len) \ + PL_ARRAY_REALLOC(parent, arr, _min_len); \ + } while (0) + +#define PL_ARRAY_MEMDUP(parent, arr, ptr, len) \ + do { \ + size_t _len = (len); \ + PL_ARRAY_RESIZE(parent, arr, _len); \ + memcpy((arr).elem, ptr, _len * sizeof((arr).elem[0])); \ + (arr).num = _len; \ + } while (0) + +#define PL_ARRAY_GROW(parent, arr) \ + do { \ + size_t _avail = pl_get_size((arr).elem) / sizeof((arr).elem[0]); \ + if (_avail < 10) { \ + PL_ARRAY_REALLOC(parent, arr, 10); \ + } else if ((arr).num == _avail) { \ + PL_ARRAY_REALLOC(parent, arr, (arr).num * 1.5); \ + } else { \ + assert((arr).elem); \ + } \ + } while (0) + +#define PL_ARRAY_APPEND(parent, arr, ...) 
\ + do { \ + PL_ARRAY_GROW(parent, arr); \ + (arr).elem[(arr).num++] = __VA_ARGS__; \ + } while (0) + +#define PL_ARRAY_CONCAT(parent, to, from) \ + do { \ + if ((from).num) { \ + PL_ARRAY_RESIZE(parent, to, (to).num + (from).num); \ + memmove(&(to).elem[(to).num], (from).elem, \ + (from).num * sizeof((from).elem[0])); \ + (to).num += (from).num; \ + } \ + } while (0) + +#define PL_ARRAY_REMOVE_RANGE(arr, idx, count) \ + do { \ + ptrdiff_t _idx = (idx); \ + if (_idx < 0) \ + _idx += (arr).num; \ + size_t _count = (count); \ + assert(_idx >= 0 && _idx + _count <= (arr).num); \ + memmove(&(arr).elem[_idx], &(arr).elem[_idx + _count], \ + ((arr).num - _idx - _count) * sizeof((arr).elem[0])); \ + (arr).num -= _count; \ + } while (0) + +#define PL_ARRAY_REMOVE_AT(arr, idx) PL_ARRAY_REMOVE_RANGE(arr, idx, 1) + +#define PL_ARRAY_INSERT_AT(parent, arr, idx, ...) \ + do { \ + ptrdiff_t _idx = (idx); \ + if (_idx < 0) \ + _idx += (arr).num + 1; \ + assert(_idx >= 0 && _idx <= (arr).num); \ + PL_ARRAY_GROW(parent, arr); \ + memmove(&(arr).elem[_idx + 1], &(arr).elem[_idx], \ + ((arr).num++ - _idx) * sizeof((arr).elem[0])); \ + (arr).elem[_idx] = __VA_ARGS__; \ + } while (0) + +// Returns whether or not there was any element to pop +#define PL_ARRAY_POP(arr, out) \ + ((arr).num > 0 \ + ? (*(out) = (arr).elem[--(arr).num], true) \ + : false \ + ) + +// Wrapper for dealing with non-PL_ARRAY arrays +#define PL_ARRAY_APPEND_RAW(parent, arr, idxvar, ...) \ + do { \ + PL_ARRAY(__typeof__((arr)[0])) _arr = { (arr), (idxvar) }; \ + PL_ARRAY_APPEND(parent, _arr, __VA_ARGS__); \ + (arr) = _arr.elem; \ + (idxvar) = _arr.num; \ + } while (0) diff --git a/src/pl_assert.h b/src/pl_assert.h new file mode 100644 index 0000000..b4c6656 --- /dev/null +++ b/src/pl_assert.h @@ -0,0 +1,37 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include <stdio.h> +#include <assert.h> + +#ifndef NDEBUG +# define pl_assert assert +#else +# define pl_assert(expr) \ + do { \ + if (!(expr)) { \ + fprintf(stderr, "Assertion failed: %s in %s:%d\n", \ + #expr, __FILE__, __LINE__); \ + abort(); \ + } \ + } while (0) +#endif + +// In C11, static asserts must have a string message +#define pl_static_assert(expr) static_assert(expr, #expr) diff --git a/src/pl_clock.h b/src/pl_clock.h new file mode 100644 index 0000000..541ef0b --- /dev/null +++ b/src/pl_clock.h @@ -0,0 +1,98 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include <time.h> +#include <stdint.h> + +#include "os.h" + +#ifdef PL_HAVE_WIN32 +# include <windows.h> +# define PL_CLOCK_QPC +#elif defined(PL_HAVE_APPLE) +# include <Availability.h> +# if (defined(__MAC_OS_X_VERSION_MIN_REQUIRED) && __MAC_OS_X_VERSION_MIN_REQUIRED < 101200) || \ + (defined(__IPHONE_OS_VERSION_MIN_REQUIRED) && __IPHONE_OS_VERSION_MIN_REQUIRED < 100000) || \ + (defined(__TV_OS_VERSION_MIN_REQUIRED) && __TV_OS_VERSION_MIN_REQUIRED < 100000) || \ + (defined(__WATCH_OS_VERSION_MIN_REQUIRED) && __WATCH_OS_VERSION_MIN_REQUIRED < 30000) || \ + !defined(CLOCK_MONOTONIC_RAW) +# include <mach/mach_time.h> +# define PL_CLOCK_MACH +# else +# define PL_CLOCK_MONOTONIC_RAW +# endif +#elif defined(CLOCK_MONOTONIC_RAW) +# define PL_CLOCK_MONOTONIC_RAW +#elif defined(TIME_UTC) +# define PL_CLOCK_TIMESPEC_GET +#else +# warning "pl_clock not implemented for this platform!" +#endif + +typedef uint64_t pl_clock_t; + +static inline pl_clock_t pl_clock_now(void) +{ +#if defined(PL_CLOCK_QPC) + + LARGE_INTEGER counter; + QueryPerformanceCounter(&counter); + return counter.QuadPart; + +#elif defined(PL_CLOCK_MACH) + + return mach_absolute_time(); + +#else + + struct timespec tp = { .tv_sec = 0, .tv_nsec = 0 }; +#if defined(PL_CLOCK_MONOTONIC_RAW) + clock_gettime(CLOCK_MONOTONIC_RAW, &tp); +#elif defined(PL_CLOCK_TIMESPEC_GET) + timespec_get(&tp, TIME_UTC); +#endif + return tp.tv_sec * UINT64_C(1000000000) + tp.tv_nsec; + +#endif +} + +static inline double pl_clock_diff(pl_clock_t a, pl_clock_t b) +{ + double frequency = 1e9; + +#if defined(PL_CLOCK_QPC) + + LARGE_INTEGER freq; + QueryPerformanceFrequency(&freq); + frequency = freq.QuadPart; + +#elif defined(PL_CLOCK_MACH) + + mach_timebase_info_data_t time_base; + if (mach_timebase_info(&time_base) != KERN_SUCCESS) + return 0; + frequency = (time_base.denom * 1e9) / time_base.numer; + +#endif + + if (b > a) + return (b - a) / -frequency; + else + return (a - b) / frequency; +} diff --git a/src/pl_string.c b/src/pl_string.c new file mode 100644 index 0000000..ba25971 --- /dev/null +++ b/src/pl_string.c @@ -0,0 +1,418 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "common.h" +#include "hash.h" + +static void grow_str(void *alloc, pl_str *str, size_t len) +{ + // Like pl_grow, but with some extra headroom + if (len > pl_get_size(str->buf)) + str->buf = pl_realloc(alloc, str->buf, len * 1.5); +} + +void pl_str_append(void *alloc, pl_str *str, pl_str append) +{ + // Also append an extra \0 for convenience, since a lot of the time + // this function will be used to generate a string buffer + grow_str(alloc, str, str->len + append.len + 1); + if (append.len) { + memcpy(str->buf + str->len, append.buf, append.len); + str->len += append.len; + } + str->buf[str->len] = '\0'; +} + +void pl_str_append_raw(void *alloc, pl_str *str, const void *ptr, size_t size) +{ + if (!size) + return; + grow_str(alloc, str, str->len + size); + memcpy(str->buf + str->len, ptr, size); + str->len += size; +} + +void pl_str_append_asprintf(void *alloc, pl_str *str, const char *fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + pl_str_append_vasprintf(alloc, str, fmt, ap); + va_end(ap); +} + +void pl_str_append_vasprintf(void *alloc, pl_str *str, const char *fmt, va_list ap) +{ + // First, we need to determine the size that will be required for + // printing the entire string. Do this by making a copy of the va_list + // and printing it to a null buffer. + va_list copy; + va_copy(copy, ap); + int size = vsnprintf(NULL, 0, fmt, copy); + va_end(copy); + if (size < 0) + return; + + // Make room in `str` and format to there directly + grow_str(alloc, str, str->len + size + 1); + str->len += vsnprintf((char *) (str->buf + str->len), size + 1, fmt, ap); +} + +int pl_str_sscanf(pl_str str, const char *fmt, ...) +{ + char *tmp = pl_strdup0(NULL, str); + va_list va; + va_start(va, fmt); + int ret = vsscanf(tmp, fmt, va); + va_end(va); + pl_free(tmp); + return ret; +} + +int pl_strchr(pl_str str, int c) +{ + if (!str.len) + return -1; + + void *pos = memchr(str.buf, c, str.len); + if (pos) + return (intptr_t) pos - (intptr_t) str.buf; + return -1; +} + +size_t pl_strspn(pl_str str, const char *accept) +{ + for (size_t i = 0; i < str.len; i++) { + if (!strchr(accept, str.buf[i])) + return i; + } + + return str.len; +} + +size_t pl_strcspn(pl_str str, const char *reject) +{ + for (size_t i = 0; i < str.len; i++) { + if (strchr(reject, str.buf[i])) + return i; + } + + return str.len; +} + +static inline bool pl_isspace(char c) +{ + switch (c) { + case ' ': + case '\n': + case '\r': + case '\t': + case '\v': + case '\f': + return true; + default: + return false; + } +} + +pl_str pl_str_strip(pl_str str) +{ + while (str.len && pl_isspace(str.buf[0])) { + str.buf++; + str.len--; + } + while (str.len && pl_isspace(str.buf[str.len - 1])) + str.len--; + return str; +} + +int pl_str_find(pl_str haystack, pl_str needle) +{ + if (!needle.len) + return 0; + + for (size_t i = 0; i + needle.len <= haystack.len; i++) { + if (memcmp(&haystack.buf[i], needle.buf, needle.len) == 0) + return i; + } + + return -1; +} + +pl_str pl_str_split_char(pl_str str, char sep, pl_str *out_rest) +{ + int pos = pl_strchr(str, sep); + if (pos < 0) { + if (out_rest) + *out_rest = (pl_str) {0}; + return str; + } else { + if (out_rest) + *out_rest = pl_str_drop(str, pos + 1); + return pl_str_take(str, pos); + } +} + +pl_str pl_str_split_chars(pl_str str, const char *seps, pl_str *out_rest) +{ + int pos = pl_strcspn(str, seps); + if (pos < 0) { + if (out_rest) + *out_rest = (pl_str) {0}; + return str; + } else { + if (out_rest) + *out_rest = pl_str_drop(str, pos + 1); + return pl_str_take(str, pos); + } +} + 
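A minimal usage sketch combining the splitting helpers above with pl_str_strip (the function name and input string are purely illustrative; printf is assumed to be available via "common.h"):

static void example_split(void)
{
    pl_str csv = pl_str0(" foo, bar ,baz");
    while (csv.len) {
        // `csv` is passed by value, so overwriting it with the remainder is safe
        pl_str field = pl_str_strip(pl_str_split_char(csv, ',', &csv));
        printf("field: '%.*s'\n", PL_STR_FMT(field));   // foo, bar, baz
    }
}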
+pl_str pl_str_split_str(pl_str str, pl_str sep, pl_str *out_rest) +{ + int pos = pl_str_find(str, sep); + if (pos < 0) { + if (out_rest) + *out_rest = (pl_str) {0}; + return str; + } else { + if (out_rest) + *out_rest = pl_str_drop(str, pos + sep.len); + return pl_str_take(str, pos); + } +} + +static bool get_hexdigit(pl_str *str, int *digit) +{ + while (str->len && pl_isspace(str->buf[0])) { + str->buf++; + str->len--; + } + + if (!str->len) { + *digit = -1; // EOF + return true; + } + + char c = str->buf[0]; + str->buf++; + str->len--; + + if (c >= '0' && c <= '9') { + *digit = c - '0'; + } else if (c >= 'a' && c <= 'f') { + *digit = c - 'a' + 10; + } else if (c >= 'A' && c <= 'F') { + *digit = c - 'A' + 10; + } else { + return false; // invalid char + } + + return true; +} + +bool pl_str_decode_hex(void *alloc, pl_str hex, pl_str *out) +{ + if (!out) + return false; + + uint8_t *buf = pl_alloc(alloc, hex.len / 2); + int len = 0; + + while (hex.len) { + int a, b; + if (!get_hexdigit(&hex, &a) || !get_hexdigit(&hex, &b)) + goto error; // invalid char + if (a < 0) // EOF + break; + if (b < 0) // only one digit + goto error; + + buf[len++] = (a << 4) | b; + } + + *out = (pl_str) { buf, len }; + return true; + +error: + pl_free(buf); + return false; +} + +struct pl_str_builder_t { + PL_ARRAY(pl_str_template) templates; + pl_str args; + pl_str output; +}; + +pl_str_builder pl_str_builder_alloc(void *alloc) +{ + pl_str_builder b = pl_zalloc_ptr(alloc, b); + return b; +} + +void pl_str_builder_free(pl_str_builder *b) +{ + if (*b) + pl_free_ptr(b); +} + +void pl_str_builder_reset(pl_str_builder b) +{ + *b = (struct pl_str_builder_t) { + .templates.elem = b->templates.elem, + .args.buf = b->args.buf, + .output.buf = b->output.buf, + }; +} + +uint64_t pl_str_builder_hash(const pl_str_builder b) +{ + size_t size = b->templates.num * sizeof(b->templates.elem[0]); + uint64_t hash = pl_mem_hash(b->templates.elem, size); + pl_hash_merge(&hash, pl_str_hash(b->args)); + return hash; +} + +pl_str pl_str_builder_exec(pl_str_builder b) +{ + pl_str args = b->args; + + b->output.len = 0; + for (int i = 0; i < b->templates.num; i++) { + size_t consumed = b->templates.elem[i](b, &b->output, args.buf); + pl_assert(consumed <= args.len); + args = pl_str_drop(args, consumed); + } + + // Terminate with an extra \0 byte for convenience + grow_str(b, &b->output, b->output.len + 1); + b->output.buf[b->output.len] = '\0'; + return b->output; +} + +void pl_str_builder_append(pl_str_builder b, pl_str_template tmpl, + const void *args, size_t size) +{ + PL_ARRAY_APPEND(b, b->templates, tmpl); + pl_str_append_raw(b, &b->args, args, size); +} + +void pl_str_builder_concat(pl_str_builder b, const pl_str_builder append) +{ + PL_ARRAY_CONCAT(b, b->templates, append->templates); + pl_str_append_raw(b, &b->args, append->args.buf, append->args.len); +} + +static size_t template_str_ptr(void *alloc, pl_str *buf, const uint8_t *args) +{ + const char *str; + memcpy(&str, args, sizeof(str)); + pl_str_append_raw(alloc, buf, str, strlen(str)); + return sizeof(str); +} + +void pl_str_builder_const_str(pl_str_builder b, const char *str) +{ + pl_str_builder_append(b, template_str_ptr, &str, sizeof(str)); +} + +static size_t template_str(void *alloc, pl_str *buf, const uint8_t *args) +{ + pl_str str; + memcpy(&str.len, args, sizeof(str.len)); + pl_str_append_raw(alloc, buf, args + sizeof(str.len), str.len); + return sizeof(str.len) + str.len; +} + +void pl_str_builder_str(pl_str_builder b, const pl_str str) +{ + pl_str_builder_append(b, 
template_str, &str.len, sizeof(str.len)); + pl_str_append_raw(b, &b->args, str.buf, str.len); +} + +void pl_str_builder_printf_c(pl_str_builder b, const char *fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + pl_str_builder_vprintf_c(b, fmt, ap); + va_end(ap); +} + +static size_t template_printf(void *alloc, pl_str *str, const uint8_t *args) +{ + const char *fmt; + memcpy(&fmt, args, sizeof(fmt)); + args += sizeof(fmt); + + return sizeof(fmt) + pl_str_append_memprintf_c(alloc, str, fmt, args); +} + +void pl_str_builder_vprintf_c(pl_str_builder b, const char *fmt, va_list ap) +{ + pl_str_builder_append(b, template_printf, &fmt, sizeof(fmt)); + + // Push all of the variadic arguments directly onto `b->args` + for (const char *c; (c = strchr(fmt, '%')) != NULL; fmt = c + 1) { + c++; + switch (c[0]) { +#define WRITE(T, x) pl_str_append_raw(b, &b->args, &(T) {x}, sizeof(T)) + case '%': continue; + case 'c': WRITE(char, va_arg(ap, int)); break; + case 'd': WRITE(int, va_arg(ap, int)); break; + case 'u': WRITE(unsigned, va_arg(ap, unsigned)); break; + case 'f': WRITE(double, va_arg(ap, double)); break; + case 'h': + assert(c[1] == 'x'); + WRITE(unsigned short, va_arg(ap, unsigned)); + c++; + break; + case 'l': + assert(c[1] == 'l'); + switch (c[2]) { + case 'u': WRITE(long long unsigned, va_arg(ap, long long unsigned)); break; + case 'd': WRITE(long long int, va_arg(ap, long long int)); break; + default: abort(); + } + c += 2; + break; + case 'z': + assert(c[1] == 'u'); + WRITE(size_t, va_arg(ap, size_t)); + c++; + break; + case 's': { + pl_str str = pl_str0(va_arg(ap, const char *)); + pl_str_append(b, &b->args, str); + b->args.len++; // expand to include \0 byte (from pl_str_append) + break; + } + case '.': { + assert(c[1] == '*'); + assert(c[2] == 's'); + int len = va_arg(ap, int); + const char *str = va_arg(ap, const char *); + WRITE(int, len); + pl_str_append_raw(b, &b->args, str, len); + c += 2; + break; + } + default: + fprintf(stderr, "Invalid conversion character: '%c'!\n", c[0]); + abort(); + } +#undef WRITE + } +} diff --git a/src/pl_string.h b/src/pl_string.h new file mode 100644 index 0000000..7a0005c --- /dev/null +++ b/src/pl_string.h @@ -0,0 +1,318 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include "common.h" + +PL_API_BEGIN + +typedef struct pl_str { + uint8_t *buf; + size_t len; +} pl_str; + +// For formatting with "%.*s" +#define PL_STR_FMT(str) (int)((str).len), ((str).buf ? (char *)((str).buf) : "") + +static inline pl_str pl_str0(const char *str) +{ + return (pl_str) { + .buf = (uint8_t *) str, + .len = str ? strlen(str) : 0, + }; +} + +// Macro version of pl_str0, for constants +#define PL_STR0(str) ((pl_str) { (uint8_t *) (str), (str) ? strlen(str) : 0 }) + +static inline pl_str pl_strdup(void *alloc, pl_str str) +{ + return (pl_str) { + .buf = (uint8_t *) (str.len ? 
pl_memdup(alloc, str.buf, str.len) : NULL), + .len = str.len, + }; +} + +// Always returns a valid string +static inline char *pl_strdup0(void *alloc, pl_str str) +{ + return pl_strndup0(alloc, str.len ? (char *) str.buf : "", str.len); +} + +// Adds a trailing \0 for convenience, even if `append` is an empty string +void pl_str_append(void *alloc, pl_str *str, pl_str append); + +// Like `pl_str_append` but for raw memory, omits trailing \0 +void pl_str_append_raw(void *alloc, pl_str *str, const void *ptr, size_t size); + +// Locale-sensitive string functions +char *pl_asprintf(void *parent, const char *fmt, ...) + PL_PRINTF(2, 3); +char *pl_vasprintf(void *parent, const char *fmt, va_list ap) + PL_PRINTF(2, 0); +void pl_str_append_asprintf(void *alloc, pl_str *str, const char *fmt, ...) + PL_PRINTF(3, 4); +void pl_str_append_vasprintf(void *alloc, pl_str *str, const char *fmt, va_list va) + PL_PRINTF(3, 0); +int pl_str_sscanf(pl_str str, const char *fmt, ...); + +// Locale-invariant versions of append_(v)asprintf +// +// NOTE: These only support a small handful of modifiers. Check `format.c` +// for a list. Calling them on an invalid string will abort! +void pl_str_append_asprintf_c(void *alloc, pl_str *str, const char *fmt, ...) + PL_PRINTF(3, 4); +void pl_str_append_vasprintf_c(void *alloc, pl_str *str, const char *fmt, va_list va) + PL_PRINTF(3, 0); + +// Variant of the above which takes arguments directly from a pointer in memory, +// reading them incrementally (tightly packed). Returns the amount of bytes +// read from `args`, as determined by the following table: +// +// %c: sizeof(char) +// %d, %u: sizeof(int) +// %f: sizeof(double) +// %lld, %llu: sizeof(long long int) +// %zu: sizeof(size_t) +// %s: \0 terminated string +// %.*s: sizeof(int) + that many bytes (no \0 terminator) +size_t pl_str_append_memprintf_c(void *alloc, pl_str *str, const char *fmt, + const void *args) + PL_PRINTF(3, 0); + +// Locale-invariant number printing +int pl_str_print_hex(char *buf, size_t len, unsigned short n); +int pl_str_print_int(char *buf, size_t len, int n); +int pl_str_print_uint(char *buf, size_t len, unsigned int n); +int pl_str_print_int64(char *buf, size_t len, int64_t n); +int pl_str_print_uint64(char *buf, size_t len, uint64_t n); +int pl_str_print_float(char *buf, size_t len, float n); +int pl_str_print_double(char *buf, size_t len, double n); + +// Locale-invariant number parsing +bool pl_str_parse_hex(pl_str str, unsigned short *out); +bool pl_str_parse_int(pl_str str, int *out); +bool pl_str_parse_uint(pl_str str, unsigned int *out); +bool pl_str_parse_int64(pl_str str, int64_t *out); +bool pl_str_parse_uint64(pl_str str, uint64_t *out); +bool pl_str_parse_float(pl_str str, float *out); +bool pl_str_parse_double(pl_str str, double *out); + +// Variants of string.h functions +int pl_strchr(pl_str str, int c); +size_t pl_strspn(pl_str str, const char *accept); +size_t pl_strcspn(pl_str str, const char *reject); + +// Strip leading/trailing whitespace +pl_str pl_str_strip(pl_str str); + +// Generic functions for cutting up strings +static inline pl_str pl_str_take(pl_str str, size_t len) +{ + if (len < str.len) + str.len = len; + return str; +} + +static inline pl_str pl_str_drop(pl_str str, size_t len) +{ + if (len >= str.len) + return (pl_str) { .buf = NULL, .len = 0 }; + + str.buf += len; + str.len -= len; + return str; +} + +// Find a substring in another string, and return its index (or -1) +int pl_str_find(pl_str haystack, pl_str needle); + +// String splitting functions. 
These return the part of the string before +// the separator, and optionally the rest (in `out_rest`). +// +// Note that the separator is not included as part of either string. +pl_str pl_str_split_char(pl_str str, char sep, pl_str *out_rest); +pl_str pl_str_split_str(pl_str str, pl_str sep, pl_str *out_rest); + +// Like `pl_str_split_char`, but splits on any char in `seps` +pl_str pl_str_split_chars(pl_str str, const char *seps, pl_str *out_rest); + +static inline pl_str pl_str_getline(pl_str str, pl_str *out_rest) +{ + return pl_str_split_char(str, '\n', out_rest); +} + +// Decode a string containing hexadecimal data. All whitespace will be silently +// ignored. When successful, this allocates a new array to store the output. +bool pl_str_decode_hex(void *alloc, pl_str hex, pl_str *out); + +static inline bool pl_str_equals(pl_str str1, pl_str str2) +{ + if (str1.len != str2.len) + return false; + if (str1.buf == str2.buf || !str1.len) + return true; + return memcmp(str1.buf, str2.buf, str1.len) == 0; +} + +static inline bool pl_str_startswith(pl_str str, pl_str prefix) +{ + if (!prefix.len) + return true; + if (str.len < prefix.len) + return false; + return memcmp(str.buf, prefix.buf, prefix.len) == 0; +} + +static inline bool pl_str_endswith(pl_str str, pl_str suffix) +{ + if (!suffix.len) + return true; + if (str.len < suffix.len) + return false; + return memcmp(str.buf + str.len - suffix.len, suffix.buf, suffix.len) == 0; +} + +static inline bool pl_str_eatstart(pl_str *str, pl_str prefix) +{ + if (!pl_str_startswith(*str, prefix)) + return false; + + str->buf += prefix.len; + str->len -= prefix.len; + return true; +} + +static inline bool pl_str_eatend(pl_str *str, pl_str suffix) +{ + if (!pl_str_endswith(*str, suffix)) + return false; + + str->len -= suffix.len; + return true; +} + +// Convenience wrappers for the above which save the use of a pl_str0 +static inline pl_str pl_str_split_str0(pl_str str, const char *sep, pl_str *out_rest) +{ + return pl_str_split_str(str, pl_str0(sep), out_rest); +} + +static inline bool pl_str_startswith0(pl_str str, const char *prefix) +{ + return pl_str_startswith(str, pl_str0(prefix)); +} + +static inline bool pl_str_endswith0(pl_str str, const char *suffix) +{ + return pl_str_endswith(str, pl_str0(suffix)); +} + +static inline bool pl_str_equals0(pl_str str1, const char *str2) +{ + return pl_str_equals(str1, pl_str0(str2)); +} + +static inline bool pl_str_eatstart0(pl_str *str, const char *prefix) +{ + return pl_str_eatstart(str, pl_str0(prefix)); +} + +static inline bool pl_str_eatend0(pl_str *str, const char *prefix) +{ + return pl_str_eatend(str, pl_str0(prefix)); +} + +// String building helpers, used to lazily construct a string by appending a +// series of string templates which can be executed on-demand into a final +// output buffer. +typedef struct pl_str_builder_t *pl_str_builder; + +// Returns the number of bytes consumed from `args`. Be warned that the pointer +// given will not necessarily be aligned to the type you need it as, so make +// sure to use `memcpy` or some other method of safely loading arbitrary data +// from memory. +typedef size_t (*pl_str_template)(void *alloc, pl_str *buf, const uint8_t *args); + +pl_str_builder pl_str_builder_alloc(void *alloc); +void pl_str_builder_free(pl_str_builder *builder); + +// Resets string builder without destroying buffer +void pl_str_builder_reset(pl_str_builder builder); + +// Returns a representative hash of the string builder's output, without +// actually executing it. 
Note that this is *not* the same as a pl_str_hash of +// the string builder's output. +// +// Note also that the output of this may not survive a process restart because +// of position-independent code and address randomization moving around the +// locations of template functions, so special care must be taken not to +// compare such hashes across process invocations. +uint64_t pl_str_builder_hash(const pl_str_builder builder); + +// Executes a string builder, dispatching all templates. The resulting string +// is guaranteed to be \0-terminated, as a minor convenience. +// +// Calling any other `pl_str_builder_*` function on this builder causes the +// contents of the returned string to become undefined. +pl_str pl_str_builder_exec(pl_str_builder builder); + +// Append a template and its arguments to a string builder +void pl_str_builder_append(pl_str_builder builder, pl_str_template tmpl, + const void *args, size_t args_size); + +// Append an entire other `pl_str_builder` onto `builder` +void pl_str_builder_concat(pl_str_builder builder, const pl_str_builder append); + +// Append a constant string. This will only record &str into the buffer, which +// may have a number of unwanted consequences if the memory pointed at by +// `str` mutates at any point in time in the future, or if `str` is not +// at a stable location in memory. +// +// This is intended for strings which are compile-time constants. +void pl_str_builder_const_str(pl_str_builder builder, const char *str); + +// Append a string. This will make a full copy of `str` +void pl_str_builder_str(pl_str_builder builder, const pl_str str); +#define pl_str_builder_str0(b, str) pl_str_builder_str(b, pl_str0(str)) + +// Append a string printf-style. This will preprocess `fmt` to determine the +// number and type of arguments. Supports the same format conversion characters +// as `pl_str_append_asprintf_c`. +void pl_str_builder_printf_c(pl_str_builder builder, const char *fmt, ...) + PL_PRINTF(2, 3); + +void pl_str_builder_vprintf_c(pl_str_builder builder, const char *fmt, va_list ap) + PL_PRINTF(2, 0); + +// Helper macro to specialize `pl_str_builder_printf_c` to +// `pl_str_builder_const_str` if it contains no format characters. +#define pl_str_builder_addf(builder, ...) do \ +{ \ + if (_contains_fmt_chars(__VA_ARGS__)) { \ + pl_str_builder_printf_c(builder, __VA_ARGS__); \ + } else { \ + pl_str_builder_const_str(builder, _get_fmt(__VA_ARGS__)); \ + } \ +} while (0) + +// Helper macros to deal with the non-portability of __VA_OPT__(,) +#define _contains_fmt_chars(fmt, ...) (strchr(fmt, '%')) +#define _get_fmt(fmt, ...) fmt + +PL_API_END diff --git a/src/pl_thread.h b/src/pl_thread.h new file mode 100644 index 0000000..7a5ae47 --- /dev/null +++ b/src/pl_thread.h @@ -0,0 +1,73 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */ + +#pragma once + +#include "os.h" + +enum pl_mutex_type { + PL_MUTEX_NORMAL = 0, + PL_MUTEX_RECURSIVE, +}; + +#define pl_mutex_init(mutex) \ + pl_mutex_init_type(mutex, PL_MUTEX_NORMAL) + +// Note: This is never compiled, and only documents the API. The actual +// implementations of these prototypes may be macros. +#ifdef PL_API_REFERENCE + +typedef void pl_mutex; +void pl_mutex_init_type(pl_mutex *mutex, enum pl_mutex_type mtype); +int pl_mutex_destroy(pl_mutex *mutex); +int pl_mutex_lock(pl_mutex *mutex); +int pl_mutex_unlock(pl_mutex *mutex); + +typedef void pl_cond; +int pl_cond_init(pl_cond *cond); +int pl_cond_destroy(pl_cond *cond); +int pl_cond_broadcast(pl_cond *cond); +int pl_cond_signal(pl_cond *cond); + +// `timeout` is in nanoseconds, or UINT64_MAX to block forever +int pl_cond_timedwait(pl_cond *cond, pl_mutex *mutex, uint64_t timeout); +int pl_cond_wait(pl_cond *cond, pl_mutex *mutex); + +typedef void pl_static_mutex; +#define PL_STATIC_MUTEX_INITIALIZER +int pl_static_mutex_lock(pl_static_mutex *mutex); +int pl_static_mutex_unlock(pl_static_mutex *mutex); + +typedef void pl_thread; +#define PL_THREAD_VOID void +#define PL_THREAD_RETURN() return +int pl_thread_create(pl_thread *thread, PL_THREAD_VOID (*fun)(void *), void *arg); +int pl_thread_join(pl_thread thread); + +// Returns true if slept the full time, false otherwise +bool pl_thread_sleep(double t); + +#endif + +// Actual platform-specific implementation +#ifdef PL_HAVE_WIN32 +#include "pl_thread_win32.h" +#elif defined(PL_HAVE_PTHREAD) +#include "pl_thread_pthread.h" +#else +#error No threading implementation available! +#endif diff --git a/src/pl_thread_pthread.h b/src/pl_thread_pthread.h new file mode 100644 index 0000000..5910650 --- /dev/null +++ b/src/pl_thread_pthread.h @@ -0,0 +1,137 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#pragma once + +#include <errno.h> +#include <pthread.h> +#include <sys/time.h> +#include <time.h> + +#include <pl_assert.h> + +typedef pthread_mutex_t pl_mutex; +typedef pthread_cond_t pl_cond; +typedef pthread_mutex_t pl_static_mutex; +typedef pthread_t pl_thread; +#define PL_STATIC_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER + +static inline int pl_mutex_init_type_internal(pl_mutex *mutex, enum pl_mutex_type mtype) +{ + int mutex_type; + switch (mtype) { + case PL_MUTEX_RECURSIVE: + mutex_type = PTHREAD_MUTEX_RECURSIVE; + break; + case PL_MUTEX_NORMAL: + default: + #ifndef NDEBUG + mutex_type = PTHREAD_MUTEX_ERRORCHECK; + #else + mutex_type = PTHREAD_MUTEX_DEFAULT; + #endif + break; + } + + int ret = 0; + pthread_mutexattr_t attr; + ret = pthread_mutexattr_init(&attr); + if (ret != 0) + return ret; + + pthread_mutexattr_settype(&attr, mutex_type); + ret = pthread_mutex_init(mutex, &attr); + pthread_mutexattr_destroy(&attr); + return ret; +} + +#define pl_mutex_init_type(mutex, mtype) \ + pl_assert(!pl_mutex_init_type_internal(mutex, mtype)) + +#define pl_mutex_destroy pthread_mutex_destroy +#define pl_mutex_lock pthread_mutex_lock +#define pl_mutex_unlock pthread_mutex_unlock + +static inline int pl_cond_init(pl_cond *cond) +{ + int ret = 0; + pthread_condattr_t attr; + ret = pthread_condattr_init(&attr); + if (ret != 0) + return ret; + +#ifdef PTHREAD_HAS_SETCLOCK + pthread_condattr_setclock(&attr, CLOCK_MONOTONIC); +#endif + ret = pthread_cond_init(cond, &attr); + pthread_condattr_destroy(&attr); + return ret; +} + +#define pl_cond_destroy pthread_cond_destroy +#define pl_cond_broadcast pthread_cond_broadcast +#define pl_cond_signal pthread_cond_signal +#define pl_cond_wait pthread_cond_wait + +static inline int pl_cond_timedwait(pl_cond *cond, pl_mutex *mutex, uint64_t timeout) +{ + if (timeout == UINT64_MAX) + return pthread_cond_wait(cond, mutex); + + struct timespec ts; +#ifdef PTHREAD_HAS_SETCLOCK + if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0) + return errno; +#else + struct timeval tv; + if (gettimeofday(&tv, NULL) < 0) // equivalent to CLOCK_REALTIME + return errno; + ts.tv_sec = tv.tv_sec; + ts.tv_nsec = tv.tv_usec * 1000; +#endif + + ts.tv_sec += timeout / 1000000000LLU; + ts.tv_nsec += timeout % 1000000000LLU; + + if (ts.tv_nsec > 1000000000L) { + ts.tv_nsec -= 1000000000L; + ts.tv_sec++; + } + + return pthread_cond_timedwait(cond, mutex, &ts); +} + +#define pl_static_mutex_lock pthread_mutex_lock +#define pl_static_mutex_unlock pthread_mutex_unlock + +#define PL_THREAD_VOID void * +#define PL_THREAD_RETURN() return NULL + +#define pl_thread_create(t, f, a) pthread_create(t, NULL, f, a) +#define pl_thread_join(t) pthread_join(t, NULL) + +static inline bool pl_thread_sleep(double t) +{ + if (t <= 0.0) + return true; + + struct timespec ts; + ts.tv_sec = (time_t) t; + ts.tv_nsec = (t - ts.tv_sec) * 1e9; + + return nanosleep(&ts, NULL) == 0; +} diff --git a/src/pl_thread_win32.h b/src/pl_thread_win32.h new file mode 100644 index 0000000..ef68d50 --- /dev/null +++ b/src/pl_thread_win32.h @@ -0,0 +1,182 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include <windows.h> +#include <process.h> +#include <stdint.h> +#include <errno.h> + +#include <pl_assert.h> + +typedef CRITICAL_SECTION pl_mutex; +typedef CONDITION_VARIABLE pl_cond; + +static inline int pl_mutex_init_type_internal(pl_mutex *mutex, enum pl_mutex_type mtype) +{ + (void) mtype; + return !InitializeCriticalSectionEx(mutex, 0, 0); +} + +#define pl_mutex_init_type(mutex, mtype) \ + pl_assert(!pl_mutex_init_type_internal(mutex, mtype)) + +static inline int pl_mutex_destroy(pl_mutex *mutex) +{ + DeleteCriticalSection(mutex); + return 0; +} + +static inline int pl_mutex_lock(pl_mutex *mutex) +{ + EnterCriticalSection(mutex); + return 0; +} + +static inline int pl_mutex_unlock(pl_mutex *mutex) +{ + LeaveCriticalSection(mutex); + return 0; +} + +static inline int pl_cond_init(pl_cond *cond) +{ + InitializeConditionVariable(cond); + return 0; +} + +static inline int pl_cond_destroy(pl_cond *cond) +{ + // condition variables are not destroyed + (void) cond; + return 0; +} + +static inline int pl_cond_broadcast(pl_cond *cond) +{ + WakeAllConditionVariable(cond); + return 0; +} + +static inline int pl_cond_signal(pl_cond *cond) +{ + WakeConditionVariable(cond); + return 0; +} + +static inline int pl_cond_wait(pl_cond *cond, pl_mutex *mutex) +{ + return !SleepConditionVariableCS(cond, mutex, INFINITE); +} + +static inline int pl_cond_timedwait(pl_cond *cond, pl_mutex *mutex, uint64_t timeout) +{ + if (timeout == UINT64_MAX) + return pl_cond_wait(cond, mutex); + + timeout /= UINT64_C(1000000); + if (timeout > INFINITE - 1) + timeout = INFINITE - 1; + + BOOL bRet = SleepConditionVariableCS(cond, mutex, timeout); + if (bRet == FALSE) + { + if (GetLastError() == ERROR_TIMEOUT) + return ETIMEDOUT; + else + return EINVAL; + } + return 0; +} + +typedef SRWLOCK pl_static_mutex; +#define PL_STATIC_MUTEX_INITIALIZER SRWLOCK_INIT + +static inline int pl_static_mutex_lock(pl_static_mutex *mutex) +{ + AcquireSRWLockExclusive(mutex); + return 0; +} + +static inline int pl_static_mutex_unlock(pl_static_mutex *mutex) +{ + ReleaseSRWLockExclusive(mutex); + return 0; +} + +typedef HANDLE pl_thread; +#define PL_THREAD_VOID unsigned __stdcall +#define PL_THREAD_RETURN() return 0 + +static inline int pl_thread_create(pl_thread *thread, + PL_THREAD_VOID (*fun)(void *), + void *__restrict arg) +{ + *thread = (HANDLE) _beginthreadex(NULL, 0, fun, arg, 0, NULL); + return *thread ? 0 : -1; +} + +static inline int pl_thread_join(pl_thread thread) +{ + DWORD ret = WaitForSingleObject(thread, INFINITE); + if (ret != WAIT_OBJECT_0) + return ret == WAIT_ABANDONED ? EINVAL : EDEADLK; + CloseHandle(thread); + return 0; +} + +static inline bool pl_thread_sleep(double t) +{ + // Time is expected in 100 nanosecond intervals. + // Negative values indicate relative time. 
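    // Editor's note (illustration only, not part of this diff): e.g. t = 0.5 s
    // yields .QuadPart = -(LONGLONG) (0.5 * 1e7) = -5000000, i.e. a relative
    // wait of 5,000,000 * 100 ns = 0.5 s for the waitable timer set up below.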
+ LARGE_INTEGER time = { .QuadPart = -(LONGLONG) (t * 1e7) }; + + if (time.QuadPart >= 0) + return true; + + bool ret = false; + +#ifndef CREATE_WAITABLE_TIMER_HIGH_RESOLUTION +# define CREATE_WAITABLE_TIMER_HIGH_RESOLUTION 0x2 +#endif + + HANDLE timer = CreateWaitableTimerEx(NULL, NULL, + CREATE_WAITABLE_TIMER_HIGH_RESOLUTION, + TIMER_ALL_ACCESS); + + // CREATE_WAITABLE_TIMER_HIGH_RESOLUTION is supported in Windows 10 1803+, + // retry without it. + if (!timer) + timer = CreateWaitableTimerEx(NULL, NULL, 0, TIMER_ALL_ACCESS); + + if (!timer) + goto end; + + if (!SetWaitableTimer(timer, &time, 0, NULL, NULL, 0)) + goto end; + + if (WaitForSingleObject(timer, INFINITE) != WAIT_OBJECT_0) + goto end; + + ret = true; + +end: + if (timer) + CloseHandle(timer); + return ret; +} diff --git a/src/renderer.c b/src/renderer.c new file mode 100644 index 0000000..cc56b6f --- /dev/null +++ b/src/renderer.c @@ -0,0 +1,3815 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <math.h> + +#include "common.h" +#include "filters.h" +#include "hash.h" +#include "shaders.h" +#include "dispatch.h" + +#include <libplacebo/renderer.h> + +struct cached_frame { + uint64_t signature; + uint64_t params_hash; // for detecting `pl_render_params` changes + struct pl_color_space color; + struct pl_icc_profile profile; + pl_rect2df crop; + pl_tex tex; + int comps; + bool evict; // for garbage collection +}; + +struct sampler { + pl_shader_obj upscaler_state; + pl_shader_obj downscaler_state; +}; + +struct osd_vertex { + float pos[2]; + float coord[2]; + float color[4]; +}; + +struct icc_state { + pl_icc_object icc; + uint64_t error; // set to profile signature on failure +}; + +struct pl_renderer_t { + pl_gpu gpu; + pl_dispatch dp; + pl_log log; + + // Cached feature checks (inverted) + enum pl_render_error errors; + + // List containing signatures of disabled hooks + PL_ARRAY(uint64_t) disabled_hooks; + + // Shader resource objects and intermediate textures (FBOs) + pl_shader_obj tone_map_state; + pl_shader_obj dither_state; + pl_shader_obj grain_state[4]; + pl_shader_obj lut_state[3]; + pl_shader_obj icc_state[2]; + PL_ARRAY(pl_tex) fbos; + struct sampler sampler_main; + struct sampler sampler_contrast; + struct sampler samplers_src[4]; + struct sampler samplers_dst[4]; + + // Temporary storage for vertex/index data + PL_ARRAY(struct osd_vertex) osd_vertices; + PL_ARRAY(uint16_t) osd_indices; + struct pl_vertex_attrib osd_attribs[3]; + + // Frame cache (for frame mixing / interpolation) + PL_ARRAY(struct cached_frame) frames; + PL_ARRAY(pl_tex) frame_fbos; + + // For debugging / logging purposes + int prev_dither; + + // For backwards compatibility + struct icc_state icc_fallback[2]; +}; + +enum { + // Index into `lut_state` + LUT_IMAGE, + LUT_TARGET, + LUT_PARAMS, +}; + +enum { + // Index into `icc_state` + ICC_IMAGE, + ICC_TARGET +}; + 
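Editor's note: for orientation, the following is a minimal usage sketch (not part of this diff) of the public renderer API that renderer.c implements. It assumes a pl_log, a pl_gpu and two fully populated pl_frame structs are created elsewhere; `example_render_one_frame` is a hypothetical helper name.

#include <libplacebo/renderer.h>

// Hypothetical helper: `log`, `gpu`, `image` and `target` are assumed to have
// been set up elsewhere (logging, GPU instance, plane uploads / swapchain frame).
static void example_render_one_frame(pl_log log, pl_gpu gpu,
                                     const struct pl_frame *image,
                                     const struct pl_frame *target)
{
    pl_renderer rr = pl_renderer_create(log, gpu);
    if (!rr)
        return;

    // Render a single image to the target using the library's defaults;
    // failures are reported through `log`
    if (!pl_render_image(rr, image, target, &pl_render_default_params)) {
        // rendering failed; error details are logged
    }

    pl_renderer_destroy(&rr);
}

Depending on the desired quality/performance trade-off, `pl_render_fast_params` or `pl_render_high_quality_params` (both defined in this file) can be passed instead of the defaults.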
+pl_renderer pl_renderer_create(pl_log log, pl_gpu gpu) +{ + pl_renderer rr = pl_alloc_ptr(NULL, rr); + *rr = (struct pl_renderer_t) { + .gpu = gpu, + .log = log, + .dp = pl_dispatch_create(log, gpu), + .osd_attribs = { + { + .name = "pos", + .offset = offsetof(struct osd_vertex, pos), + .fmt = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 2), + }, { + .name = "coord", + .offset = offsetof(struct osd_vertex, coord), + .fmt = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 2), + }, { + .name = "osd_color", + .offset = offsetof(struct osd_vertex, color), + .fmt = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 4), + } + }, + }; + + assert(rr->dp); + return rr; +} + +static void sampler_destroy(pl_renderer rr, struct sampler *sampler) +{ + pl_shader_obj_destroy(&sampler->upscaler_state); + pl_shader_obj_destroy(&sampler->downscaler_state); +} + +void pl_renderer_destroy(pl_renderer *p_rr) +{ + pl_renderer rr = *p_rr; + if (!rr) + return; + + // Free all intermediate FBOs + for (int i = 0; i < rr->fbos.num; i++) + pl_tex_destroy(rr->gpu, &rr->fbos.elem[i]); + for (int i = 0; i < rr->frames.num; i++) + pl_tex_destroy(rr->gpu, &rr->frames.elem[i].tex); + for (int i = 0; i < rr->frame_fbos.num; i++) + pl_tex_destroy(rr->gpu, &rr->frame_fbos.elem[i]); + + // Free all shader resource objects + pl_shader_obj_destroy(&rr->tone_map_state); + pl_shader_obj_destroy(&rr->dither_state); + for (int i = 0; i < PL_ARRAY_SIZE(rr->lut_state); i++) + pl_shader_obj_destroy(&rr->lut_state[i]); + for (int i = 0; i < PL_ARRAY_SIZE(rr->grain_state); i++) + pl_shader_obj_destroy(&rr->grain_state[i]); + for (int i = 0; i < PL_ARRAY_SIZE(rr->icc_state); i++) + pl_shader_obj_destroy(&rr->icc_state[i]); + + // Free all samplers + sampler_destroy(rr, &rr->sampler_main); + sampler_destroy(rr, &rr->sampler_contrast); + for (int i = 0; i < PL_ARRAY_SIZE(rr->samplers_src); i++) + sampler_destroy(rr, &rr->samplers_src[i]); + for (int i = 0; i < PL_ARRAY_SIZE(rr->samplers_dst); i++) + sampler_destroy(rr, &rr->samplers_dst[i]); + + // Free fallback ICC profiles + for (int i = 0; i < PL_ARRAY_SIZE(rr->icc_fallback); i++) + pl_icc_close(&rr->icc_fallback[i].icc); + + pl_dispatch_destroy(&rr->dp); + pl_free_ptr(p_rr); +} + +size_t pl_renderer_save(pl_renderer rr, uint8_t *out) +{ + return pl_cache_save(pl_gpu_cache(rr->gpu), out, out ? 
SIZE_MAX : 0); +} + +void pl_renderer_load(pl_renderer rr, const uint8_t *cache) +{ + pl_cache_load(pl_gpu_cache(rr->gpu), cache, SIZE_MAX); +} + +void pl_renderer_flush_cache(pl_renderer rr) +{ + for (int i = 0; i < rr->frames.num; i++) + pl_tex_destroy(rr->gpu, &rr->frames.elem[i].tex); + rr->frames.num = 0; + + pl_reset_detected_peak(rr->tone_map_state); +} + +const struct pl_render_params pl_render_fast_params = { PL_RENDER_DEFAULTS }; +const struct pl_render_params pl_render_default_params = { + PL_RENDER_DEFAULTS + .upscaler = &pl_filter_lanczos, + .downscaler = &pl_filter_hermite, + .frame_mixer = &pl_filter_oversample, + .sigmoid_params = &pl_sigmoid_default_params, + .dither_params = &pl_dither_default_params, + .peak_detect_params = &pl_peak_detect_default_params, +}; + +const struct pl_render_params pl_render_high_quality_params = { + PL_RENDER_DEFAULTS + .upscaler = &pl_filter_ewa_lanczossharp, + .downscaler = &pl_filter_hermite, + .frame_mixer = &pl_filter_oversample, + .sigmoid_params = &pl_sigmoid_default_params, + .peak_detect_params = &pl_peak_detect_high_quality_params, + .color_map_params = &pl_color_map_high_quality_params, + .dither_params = &pl_dither_default_params, + .deband_params = &pl_deband_default_params, +}; + +const struct pl_filter_preset pl_frame_mixers[] = { + { "none", NULL, "No frame mixing" }, + { "linear", &pl_filter_bilinear, "Linear frame mixing" }, + { "oversample", &pl_filter_oversample, "Oversample (AKA SmoothMotion)" }, + { "mitchell_clamp", &pl_filter_mitchell_clamp, "Clamped Mitchell spline" }, + { "hermite", &pl_filter_hermite, "Cubic spline (Hermite)" }, + {0} +}; + +const int pl_num_frame_mixers = PL_ARRAY_SIZE(pl_frame_mixers) - 1; + +const struct pl_filter_preset pl_scale_filters[] = { + {"none", NULL, "Built-in sampling"}, + {"oversample", &pl_filter_oversample, "Oversample (Aspect-preserving NN)"}, + COMMON_FILTER_PRESETS, + {0} +}; + +const int pl_num_scale_filters = PL_ARRAY_SIZE(pl_scale_filters) - 1; + +// Represents a "in-flight" image, which is either a shader that's in the +// process of producing some sort of image, or a texture that needs to be +// sampled from +struct img { + // Effective texture size, always set + int w, h; + + // Recommended format (falls back to fbofmt otherwise), only for shaders + pl_fmt fmt; + + // Exactly *one* of these two is set: + pl_shader sh; + pl_tex tex; + + // If true, created shaders will be set to unique + bool unique; + + // Information about what to log/disable/fallback to if the shader fails + const char *err_msg; + enum pl_render_error err_enum; + pl_tex err_tex; + + // Current effective source area, will be sampled by the main scaler + pl_rect2df rect; + + // The current effective colorspace + struct pl_color_repr repr; + struct pl_color_space color; + int comps; +}; + +// Plane 'type', ordered by incrementing priority +enum plane_type { + PLANE_INVALID = 0, + PLANE_ALPHA, + PLANE_CHROMA, + PLANE_LUMA, + PLANE_RGB, + PLANE_XYZ, +}; + +static inline enum plane_type detect_plane_type(const struct pl_plane *plane, + const struct pl_color_repr *repr) +{ + if (pl_color_system_is_ycbcr_like(repr->sys)) { + int t = PLANE_INVALID; + for (int c = 0; c < plane->components; c++) { + switch (plane->component_mapping[c]) { + case PL_CHANNEL_Y: t = PL_MAX(t, PLANE_LUMA); continue; + case PL_CHANNEL_A: t = PL_MAX(t, PLANE_ALPHA); continue; + + case PL_CHANNEL_CB: + case PL_CHANNEL_CR: + t = PL_MAX(t, PLANE_CHROMA); + continue; + + default: continue; + } + } + + pl_assert(t); + return t; + } + + // Extra 
test for exclusive / separated alpha plane + if (plane->components == 1 && plane->component_mapping[0] == PL_CHANNEL_A) + return PLANE_ALPHA; + + switch (repr->sys) { + case PL_COLOR_SYSTEM_UNKNOWN: // fall through to RGB + case PL_COLOR_SYSTEM_RGB: return PLANE_RGB; + case PL_COLOR_SYSTEM_XYZ: return PLANE_XYZ; + + // For the switch completeness check + case PL_COLOR_SYSTEM_BT_601: + case PL_COLOR_SYSTEM_BT_709: + case PL_COLOR_SYSTEM_SMPTE_240M: + case PL_COLOR_SYSTEM_BT_2020_NC: + case PL_COLOR_SYSTEM_BT_2020_C: + case PL_COLOR_SYSTEM_BT_2100_PQ: + case PL_COLOR_SYSTEM_BT_2100_HLG: + case PL_COLOR_SYSTEM_DOLBYVISION: + case PL_COLOR_SYSTEM_YCGCO: + case PL_COLOR_SYSTEM_COUNT: + break; + } + + pl_unreachable(); +} + +struct pass_state { + void *tmp; + pl_renderer rr; + const struct pl_render_params *params; + struct pl_render_info info; // for info callback + + // Represents the "current" image which we're in the process of rendering. + // This is initially set by pass_read_image, and all of the subsequent + // rendering steps will mutate this in-place. + struct img img; + + // Represents the "reference rect". Canonically, this is functionally + // equivalent to `image.crop`, but also updates as the refplane evolves + // (e.g. due to user hook prescalers) + pl_rect2df ref_rect; + + // Integer version of `target.crop`. Semantically identical. + pl_rect2d dst_rect; + + // Logical end-to-end rotation + pl_rotation rotation; + + // Cached copies of the `image` / `target` for this rendering pass, + // corrected to make sure all rects etc. are properly defaulted/inferred. + struct pl_frame image; + struct pl_frame target; + + // Cached copies of the `prev` / `next` frames, for deinterlacing. + struct pl_frame prev, next; + + // Some extra plane metadata, inferred from `planes` + enum plane_type src_type[4]; + int src_ref, dst_ref; // index into `planes` + + // Metadata for `rr->fbos` + pl_fmt fbofmt[5]; + bool *fbos_used; + bool need_peak_fbo; // need indirection for peak detection + + // Map of acquired frames + struct { + bool target, image, prev, next; + } acquired; +}; + +static void find_fbo_format(struct pass_state *pass) +{ + const struct pl_render_params *params = pass->params; + pl_renderer rr = pass->rr; + if (params->disable_fbos || (rr->errors & PL_RENDER_ERR_FBO) || pass->fbofmt[4]) + return; + + struct { + enum pl_fmt_type type; + int depth; + enum pl_fmt_caps caps; + } configs[] = { + // Prefer floating point formats first + {PL_FMT_FLOAT, 16, PL_FMT_CAP_LINEAR}, + {PL_FMT_FLOAT, 16, PL_FMT_CAP_SAMPLEABLE}, + + // Otherwise, fall back to unorm/snorm, preferring linearly sampleable + {PL_FMT_UNORM, 16, PL_FMT_CAP_LINEAR}, + {PL_FMT_SNORM, 16, PL_FMT_CAP_LINEAR}, + {PL_FMT_UNORM, 16, PL_FMT_CAP_SAMPLEABLE}, + {PL_FMT_SNORM, 16, PL_FMT_CAP_SAMPLEABLE}, + + // As a final fallback, allow 8-bit FBO formats (for UNORM only) + {PL_FMT_UNORM, 8, PL_FMT_CAP_LINEAR}, + {PL_FMT_UNORM, 8, PL_FMT_CAP_SAMPLEABLE}, + }; + + pl_fmt fmt = NULL; + for (int i = 0; i < PL_ARRAY_SIZE(configs); i++) { + if (params->force_low_bit_depth_fbos && configs[i].depth > 8) + continue; + + fmt = pl_find_fmt(rr->gpu, configs[i].type, 4, configs[i].depth, 0, + PL_FMT_CAP_RENDERABLE | configs[i].caps); + if (!fmt) + continue; + + pass->fbofmt[4] = fmt; + + // Probe the right variant for each number of channels, falling + // back to the next biggest format + for (int c = 1; c < 4; c++) { + pass->fbofmt[c] = pl_find_fmt(rr->gpu, configs[i].type, c, + configs[i].depth, 0, fmt->caps); + pass->fbofmt[c] = 
PL_DEF(pass->fbofmt[c], pass->fbofmt[c+1]); + } + return; + } + + PL_WARN(rr, "Found no renderable FBO format! Most features disabled"); + rr->errors |= PL_RENDER_ERR_FBO; +} + +static void info_callback(void *priv, const struct pl_dispatch_info *dinfo) +{ + struct pass_state *pass = priv; + const struct pl_render_params *params = pass->params; + if (!params->info_callback) + return; + + pass->info.pass = dinfo; + params->info_callback(params->info_priv, &pass->info); + pass->info.index++; +} + +static pl_tex get_fbo(struct pass_state *pass, int w, int h, pl_fmt fmt, + int comps, pl_debug_tag debug_tag) +{ + pl_renderer rr = pass->rr; + comps = PL_DEF(comps, 4); + fmt = PL_DEF(fmt, pass->fbofmt[comps]); + if (!fmt) + return NULL; + + struct pl_tex_params params = { + .w = w, + .h = h, + .format = fmt, + .sampleable = true, + .renderable = true, + .blit_src = fmt->caps & PL_FMT_CAP_BLITTABLE, + .storable = fmt->caps & PL_FMT_CAP_STORABLE, + .debug_tag = debug_tag, + }; + + int best_idx = -1; + int best_diff = 0; + + // Find the best-fitting texture out of rr->fbos + for (int i = 0; i < rr->fbos.num; i++) { + if (pass->fbos_used[i]) + continue; + + // Orthogonal distance, with penalty for format mismatches + int diff = abs(rr->fbos.elem[i]->params.w - w) + + abs(rr->fbos.elem[i]->params.h - h) + + ((rr->fbos.elem[i]->params.format != fmt) ? 1000 : 0); + + if (best_idx < 0 || diff < best_diff) { + best_idx = i; + best_diff = diff; + } + } + + // No texture found at all, add a new one + if (best_idx < 0) { + best_idx = rr->fbos.num; + PL_ARRAY_APPEND(rr, rr->fbos, NULL); + pl_grow(pass->tmp, &pass->fbos_used, rr->fbos.num * sizeof(bool)); + pass->fbos_used[best_idx] = false; + } + + if (!pl_tex_recreate(rr->gpu, &rr->fbos.elem[best_idx], ¶ms)) + return NULL; + + pass->fbos_used[best_idx] = true; + return rr->fbos.elem[best_idx]; +} + +// Forcibly convert an img to `tex`, dispatching where necessary +static pl_tex _img_tex(struct pass_state *pass, struct img *img, pl_debug_tag tag) +{ + if (img->tex) { + pl_assert(!img->sh); + return img->tex; + } + + pl_renderer rr = pass->rr; + pl_tex tex = get_fbo(pass, img->w, img->h, img->fmt, img->comps, tag); + img->fmt = NULL; + + if (!tex) { + PL_ERR(rr, "Failed creating FBO texture! 
Disabling advanced rendering.."); + memset(pass->fbofmt, 0, sizeof(pass->fbofmt)); + pl_dispatch_abort(rr->dp, &img->sh); + rr->errors |= PL_RENDER_ERR_FBO; + return img->err_tex; + } + + pl_assert(img->sh); + bool ok = pl_dispatch_finish(rr->dp, pl_dispatch_params( + .shader = &img->sh, + .target = tex, + )); + + const char *err_msg = img->err_msg; + enum pl_render_error err_enum = img->err_enum; + pl_tex err_tex = img->err_tex; + img->err_msg = NULL; + img->err_enum = PL_RENDER_ERR_NONE; + img->err_tex = NULL; + + if (!ok) { + PL_ERR(rr, "%s", PL_DEF(err_msg, "Failed dispatching intermediate pass!")); + rr->errors |= err_enum; + img->sh = pl_dispatch_begin(rr->dp); + img->tex = err_tex; + return img->tex; + } + + img->tex = tex; + return img->tex; +} + +#define img_tex(pass, img) _img_tex(pass, img, PL_DEBUG_TAG) + +// Forcibly convert an img to `sh`, sampling where necessary +static pl_shader img_sh(struct pass_state *pass, struct img *img) +{ + if (img->sh) { + pl_assert(!img->tex); + return img->sh; + } + + pl_assert(img->tex); + img->sh = pl_dispatch_begin_ex(pass->rr->dp, img->unique); + pl_shader_sample_direct(img->sh, pl_sample_src( .tex = img->tex )); + + img->tex = NULL; + return img->sh; +} + +enum sampler_type { + SAMPLER_DIRECT, // pick based on texture caps + SAMPLER_NEAREST, // direct sampling, force nearest + SAMPLER_BICUBIC, // fast bicubic scaling + SAMPLER_HERMITE, // fast hermite scaling + SAMPLER_GAUSSIAN, // fast gaussian scaling + SAMPLER_COMPLEX, // complex custom filters + SAMPLER_OVERSAMPLE, +}; + +enum sampler_dir { + SAMPLER_NOOP, // 1:1 scaling + SAMPLER_UP, // upscaling + SAMPLER_DOWN, // downscaling +}; + +enum sampler_usage { + SAMPLER_MAIN, + SAMPLER_PLANE, + SAMPLER_CONTRAST, +}; + +struct sampler_info { + const struct pl_filter_config *config; // if applicable + enum sampler_usage usage; + enum sampler_type type; + enum sampler_dir dir; + enum sampler_dir dir_sep[2]; +}; + +static struct sampler_info sample_src_info(struct pass_state *pass, + const struct pl_sample_src *src, + enum sampler_usage usage) +{ + const struct pl_render_params *params = pass->params; + struct sampler_info info = { .usage = usage }; + pl_renderer rr = pass->rr; + + float rx = src->new_w / fabsf(pl_rect_w(src->rect)); + if (rx < 1.0 - 1e-6) { + info.dir_sep[0] = SAMPLER_DOWN; + } else if (rx > 1.0 + 1e-6) { + info.dir_sep[0] = SAMPLER_UP; + } + + float ry = src->new_h / fabsf(pl_rect_h(src->rect)); + if (ry < 1.0 - 1e-6) { + info.dir_sep[1] = SAMPLER_DOWN; + } else if (ry > 1.0 + 1e-6) { + info.dir_sep[1] = SAMPLER_UP; + } + + if (params->correct_subpixel_offsets) { + if (!info.dir_sep[0] && fabsf(src->rect.x0) > 1e-6f) + info.dir_sep[0] = SAMPLER_UP; + if (!info.dir_sep[1] && fabsf(src->rect.y0) > 1e-6f) + info.dir_sep[1] = SAMPLER_UP; + } + + // We use PL_MAX so downscaling overrides upscaling when choosing scalers + info.dir = PL_MAX(info.dir_sep[0], info.dir_sep[1]); + switch (info.dir) { + case SAMPLER_DOWN: + if (usage == SAMPLER_CONTRAST) { + info.config = &pl_filter_bicubic; + } else if (usage == SAMPLER_PLANE && params->plane_downscaler) { + info.config = params->plane_downscaler; + } else { + info.config = params->downscaler; + } + break; + case SAMPLER_UP: + if (usage == SAMPLER_PLANE && params->plane_upscaler) { + info.config = params->plane_upscaler; + } else { + pl_assert(usage != SAMPLER_CONTRAST); + info.config = params->upscaler; + } + break; + case SAMPLER_NOOP: + info.type = SAMPLER_NEAREST; + return info; + } + + if ((rr->errors & PL_RENDER_ERR_SAMPLING) || 
!info.config) { + info.type = SAMPLER_DIRECT; + } else if (info.config->kernel == &pl_filter_function_oversample) { + info.type = SAMPLER_OVERSAMPLE; + } else { + info.type = SAMPLER_COMPLEX; + + // Try using faster replacements for GPU built-in scalers + pl_fmt texfmt = src->tex ? src->tex->params.format : pass->fbofmt[4]; + bool can_linear = texfmt->caps & PL_FMT_CAP_LINEAR; + bool can_fast = info.dir == SAMPLER_UP || params->skip_anti_aliasing; + + if (can_fast && !params->disable_builtin_scalers) { + if (can_linear && info.config == &pl_filter_bicubic) + info.type = SAMPLER_BICUBIC; + if (can_linear && info.config == &pl_filter_hermite) + info.type = SAMPLER_HERMITE; + if (can_linear && info.config == &pl_filter_gaussian) + info.type = SAMPLER_GAUSSIAN; + if (can_linear && info.config == &pl_filter_bilinear) + info.type = SAMPLER_DIRECT; + if (info.config == &pl_filter_nearest) + info.type = can_linear ? SAMPLER_NEAREST : SAMPLER_DIRECT; + } + } + + // Disable advanced scaling without FBOs + if (!pass->fbofmt[4] && info.type == SAMPLER_COMPLEX) + info.type = SAMPLER_DIRECT; + + return info; +} + +static void dispatch_sampler(struct pass_state *pass, pl_shader sh, + struct sampler *sampler, enum sampler_usage usage, + pl_tex target_tex, const struct pl_sample_src *src) +{ + const struct pl_render_params *params = pass->params; + if (!sampler) + goto fallback; + + pl_renderer rr = pass->rr; + struct sampler_info info = sample_src_info(pass, src, usage); + pl_shader_obj *lut = NULL; + switch (info.dir) { + case SAMPLER_NOOP: + goto fallback; + case SAMPLER_DOWN: + lut = &sampler->downscaler_state; + break; + case SAMPLER_UP: + lut = &sampler->upscaler_state; + break; + } + + switch (info.type) { + case SAMPLER_DIRECT: + goto fallback; + case SAMPLER_NEAREST: + pl_shader_sample_nearest(sh, src); + return; + case SAMPLER_OVERSAMPLE: + pl_shader_sample_oversample(sh, src, info.config->kernel->params[0]); + return; + case SAMPLER_BICUBIC: + pl_shader_sample_bicubic(sh, src); + return; + case SAMPLER_HERMITE: + pl_shader_sample_hermite(sh, src); + return; + case SAMPLER_GAUSSIAN: + pl_shader_sample_gaussian(sh, src); + return; + case SAMPLER_COMPLEX: + break; // continue below + } + + pl_assert(lut); + struct pl_sample_filter_params fparams = { + .filter = *info.config, + .antiring = params->antiringing_strength, + .no_widening = params->skip_anti_aliasing && usage != SAMPLER_CONTRAST, + .lut = lut, + }; + + if (target_tex) { + fparams.no_compute = !target_tex->params.storable; + } else { + fparams.no_compute = !(pass->fbofmt[4]->caps & PL_FMT_CAP_STORABLE); + } + + bool ok; + if (info.config->polar) { + // Polar samplers are always a single function call + ok = pl_shader_sample_polar(sh, src, &fparams); + } else if (info.dir_sep[0] && info.dir_sep[1]) { + // Scaling is needed in both directions + struct pl_sample_src src1 = *src, src2 = *src; + src1.new_w = src->tex->params.w; + src1.rect.x0 = 0; + src1.rect.x1 = src1.new_w;; + src2.rect.y0 = 0; + src2.rect.y1 = src1.new_h; + + pl_shader tsh = pl_dispatch_begin(rr->dp); + ok = pl_shader_sample_ortho2(tsh, &src1, &fparams); + if (!ok) { + pl_dispatch_abort(rr->dp, &tsh); + goto done; + } + + struct img img = { + .sh = tsh, + .w = src1.new_w, + .h = src1.new_h, + .comps = src->components, + }; + + src2.tex = img_tex(pass, &img); + src2.scale = 1.0; + ok = src2.tex && pl_shader_sample_ortho2(sh, &src2, &fparams); + } else { + // Scaling is needed only in one direction + ok = pl_shader_sample_ortho2(sh, src, &fparams); + } + +done: + if (!ok) { + 
PL_ERR(rr, "Failed dispatching scaler.. disabling"); + rr->errors |= PL_RENDER_ERR_SAMPLING; + goto fallback; + } + + return; + +fallback: + // If all else fails, fall back to auto sampling + pl_shader_sample_direct(sh, src); +} + +static void swizzle_color(pl_shader sh, int comps, const int comp_map[4], + bool force_alpha) +{ + ident_t orig = sh_fresh(sh, "orig_color"); + GLSL("vec4 "$" = color; \n" + "color = vec4(0.0, 0.0, 0.0, 1.0); \n", orig); + + static const int def_map[4] = {0, 1, 2, 3}; + comp_map = PL_DEF(comp_map, def_map); + + for (int c = 0; c < comps; c++) { + if (comp_map[c] >= 0) + GLSL("color[%d] = "$"[%d]; \n", c, orig, comp_map[c]); + } + + if (force_alpha) + GLSL("color.a = "$".a; \n", orig); +} + +// `scale` adapts from `pass->dst_rect` to the plane being rendered to +static void draw_overlays(struct pass_state *pass, pl_tex fbo, + int comps, const int comp_map[4], + const struct pl_overlay *overlays, int num, + struct pl_color_space color, struct pl_color_repr repr, + const pl_transform2x2 *output_shift) +{ + pl_renderer rr = pass->rr; + if (num <= 0 || (rr->errors & PL_RENDER_ERR_OVERLAY)) + return; + + enum pl_fmt_caps caps = fbo->params.format->caps; + if (!(rr->errors & PL_RENDER_ERR_BLENDING) && + !(caps & PL_FMT_CAP_BLENDABLE)) + { + PL_WARN(rr, "Trying to draw an overlay to a non-blendable target. " + "Alpha blending is disabled, results may be incorrect!"); + rr->errors |= PL_RENDER_ERR_BLENDING; + } + + const struct pl_frame *image = pass->src_ref >= 0 ? &pass->image : NULL; + pl_transform2x2 src_to_dst; + if (image) { + float rx = pl_rect_w(pass->dst_rect) / pl_rect_w(image->crop); + float ry = pl_rect_h(pass->dst_rect) / pl_rect_h(image->crop); + src_to_dst = (pl_transform2x2) { + .mat.m = {{ rx, 0 }, { 0, ry }}, + .c = { + pass->dst_rect.x0 - rx * image->crop.x0, + pass->dst_rect.y0 - ry * image->crop.y0, + }, + }; + + if (pass->rotation % PL_ROTATION_180 == PL_ROTATION_90) { + PL_SWAP(src_to_dst.c[0], src_to_dst.c[1]); + src_to_dst.mat = (pl_matrix2x2) {{{ 0, ry }, { rx, 0 }}}; + } + } + + const struct pl_frame *target = &pass->target; + pl_rect2df dst_crop = target->crop; + pl_rect2df_rotate(&dst_crop, -pass->rotation); + pl_rect2df_normalize(&dst_crop); + + for (int n = 0; n < num; n++) { + struct pl_overlay ol = overlays[n]; + if (!ol.num_parts) + continue; + + if (!ol.coords) { + ol.coords = overlays == target->overlays + ? 
PL_OVERLAY_COORDS_DST_FRAME + : PL_OVERLAY_COORDS_SRC_FRAME; + } + + pl_transform2x2 tf = pl_transform2x2_identity; + switch (ol.coords) { + case PL_OVERLAY_COORDS_SRC_CROP: + if (!image) + continue; + tf.c[0] = image->crop.x0; + tf.c[1] = image->crop.y0; + // fall through + case PL_OVERLAY_COORDS_SRC_FRAME: + if (!image) + continue; + pl_transform2x2_rmul(&src_to_dst, &tf); + break; + case PL_OVERLAY_COORDS_DST_CROP: + tf.c[0] = dst_crop.x0; + tf.c[1] = dst_crop.y0; + break; + case PL_OVERLAY_COORDS_DST_FRAME: + break; + case PL_OVERLAY_COORDS_AUTO: + case PL_OVERLAY_COORDS_COUNT: + pl_unreachable(); + } + + if (output_shift) + pl_transform2x2_rmul(output_shift, &tf); + + // Construct vertex/index buffers + rr->osd_vertices.num = 0; + rr->osd_indices.num = 0; + for (int i = 0; i < ol.num_parts; i++) { + const struct pl_overlay_part *part = &ol.parts[i]; + +#define EMIT_VERT(x, y) \ + do { \ + float pos[2] = { part->dst.x, part->dst.y }; \ + pl_transform2x2_apply(&tf, pos); \ + PL_ARRAY_APPEND(rr, rr->osd_vertices, (struct osd_vertex) { \ + .pos = { \ + 2.0 * (pos[0] / fbo->params.w) - 1.0, \ + 2.0 * (pos[1] / fbo->params.h) - 1.0, \ + }, \ + .coord = { \ + part->src.x / ol.tex->params.w, \ + part->src.y / ol.tex->params.h, \ + }, \ + .color = { \ + part->color[0], part->color[1], \ + part->color[2], part->color[3], \ + }, \ + }); \ + } while (0) + + int idx_base = rr->osd_vertices.num; + EMIT_VERT(x0, y0); // idx 0: top left + EMIT_VERT(x1, y0); // idx 1: top right + EMIT_VERT(x0, y1); // idx 2: bottom left + EMIT_VERT(x1, y1); // idx 3: bottom right + PL_ARRAY_APPEND(rr, rr->osd_indices, idx_base + 0); + PL_ARRAY_APPEND(rr, rr->osd_indices, idx_base + 1); + PL_ARRAY_APPEND(rr, rr->osd_indices, idx_base + 2); + PL_ARRAY_APPEND(rr, rr->osd_indices, idx_base + 2); + PL_ARRAY_APPEND(rr, rr->osd_indices, idx_base + 1); + PL_ARRAY_APPEND(rr, rr->osd_indices, idx_base + 3); + } + + // Draw parts + pl_shader sh = pl_dispatch_begin(rr->dp); + ident_t tex = sh_desc(sh, (struct pl_shader_desc) { + .desc = { + .name = "osd_tex", + .type = PL_DESC_SAMPLED_TEX, + }, + .binding = { + .object = ol.tex, + .sample_mode = (ol.tex->params.format->caps & PL_FMT_CAP_LINEAR) + ? PL_TEX_SAMPLE_LINEAR + : PL_TEX_SAMPLE_NEAREST, + }, + }); + + sh_describe(sh, "overlay"); + GLSL("// overlay \n"); + + switch (ol.mode) { + case PL_OVERLAY_NORMAL: + GLSL("vec4 color = textureLod("$", coord, 0.0); \n", tex); + break; + case PL_OVERLAY_MONOCHROME: + GLSL("vec4 color = osd_color; \n"); + break; + case PL_OVERLAY_MODE_COUNT: + pl_unreachable(); + }; + + static const struct pl_color_map_params osd_params = { + PL_COLOR_MAP_DEFAULTS + .tone_mapping_function = &pl_tone_map_linear, + .gamut_mapping = &pl_gamut_map_saturation, + }; + + sh->output = PL_SHADER_SIG_COLOR; + pl_shader_decode_color(sh, &ol.repr, NULL); + if (target->icc) + color.transfer = PL_COLOR_TRC_LINEAR; + pl_shader_color_map_ex(sh, &osd_params, pl_color_map_args(ol.color, color)); + if (target->icc) + pl_icc_encode(sh, target->icc, &rr->icc_state[ICC_TARGET]); + + bool premul = repr.alpha == PL_ALPHA_PREMULTIPLIED; + pl_shader_encode_color(sh, &repr); + if (ol.mode == PL_OVERLAY_MONOCHROME) { + GLSL("color.%s *= textureLod("$", coord, 0.0).r; \n", + premul ? "rgba" : "a", tex); + } + + swizzle_color(sh, comps, comp_map, true); + + struct pl_blend_params blend_params = { + .src_rgb = premul ? 
PL_BLEND_ONE : PL_BLEND_SRC_ALPHA, + .src_alpha = PL_BLEND_ONE, + .dst_rgb = PL_BLEND_ONE_MINUS_SRC_ALPHA, + .dst_alpha = PL_BLEND_ONE_MINUS_SRC_ALPHA, + }; + + bool ok = pl_dispatch_vertex(rr->dp, pl_dispatch_vertex_params( + .shader = &sh, + .target = fbo, + .blend_params = (rr->errors & PL_RENDER_ERR_BLENDING) + ? NULL : &blend_params, + .vertex_stride = sizeof(struct osd_vertex), + .num_vertex_attribs = ol.mode == PL_OVERLAY_NORMAL ? 2 : 3, + .vertex_attribs = rr->osd_attribs, + .vertex_position_idx = 0, + .vertex_coords = PL_COORDS_NORMALIZED, + .vertex_type = PL_PRIM_TRIANGLE_LIST, + .vertex_count = rr->osd_indices.num, + .vertex_data = rr->osd_vertices.elem, + .index_data = rr->osd_indices.elem, + )); + + if (!ok) { + PL_ERR(rr, "Failed rendering overlays!"); + rr->errors |= PL_RENDER_ERR_OVERLAY; + return; + } + } +} + +static pl_tex get_hook_tex(void *priv, int width, int height) +{ + struct pass_state *pass = priv; + + return get_fbo(pass, width, height, NULL, 4, PL_DEBUG_TAG); +} + +// Returns if any hook was applied (even if there were errors) +static bool pass_hook(struct pass_state *pass, struct img *img, + enum pl_hook_stage stage) +{ + const struct pl_render_params *params = pass->params; + pl_renderer rr = pass->rr; + if (!pass->fbofmt[4] || !stage) + return false; + + bool ret = false; + + for (int n = 0; n < params->num_hooks; n++) { + const struct pl_hook *hook = params->hooks[n]; + if (!(hook->stages & stage)) + continue; + + // Hopefully the list of disabled hooks is small, search linearly. + for (int i = 0; i < rr->disabled_hooks.num; i++) { + if (rr->disabled_hooks.elem[i] != hook->signature) + continue; + PL_TRACE(rr, "Skipping hook %d (0x%"PRIx64") stage 0x%x", + n, hook->signature, stage); + goto hook_skip; + } + + PL_TRACE(rr, "Dispatching hook %d (0x%"PRIx64") stage 0x%x", + n, hook->signature, stage); + struct pl_hook_params hparams = { + .gpu = rr->gpu, + .dispatch = rr->dp, + .get_tex = get_hook_tex, + .priv = pass, + .stage = stage, + .rect = img->rect, + .repr = img->repr, + .color = img->color, + .orig_repr = &pass->image.repr, + .orig_color = &pass->image.color, + .components = img->comps, + .src_rect = pass->ref_rect, + .dst_rect = pass->dst_rect, + }; + + // TODO: Add some sort of `test` API function to the hooks that allows + // us to skip having to touch the `img` state at all for no-ops + + switch (hook->input) { + case PL_HOOK_SIG_NONE: + break; + + case PL_HOOK_SIG_TEX: { + hparams.tex = img_tex(pass, img); + if (!hparams.tex) { + PL_ERR(rr, "Failed dispatching shader prior to hook!"); + goto hook_error; + } + break; + } + + case PL_HOOK_SIG_COLOR: + hparams.sh = img_sh(pass, img); + break; + + case PL_HOOK_SIG_COUNT: + pl_unreachable(); + } + + struct pl_hook_res res = hook->hook(hook->priv, &hparams); + if (res.failed) { + PL_ERR(rr, "Failed executing hook, disabling"); + goto hook_error; + } + + bool resizable = pl_hook_stage_resizable(stage); + switch (res.output) { + case PL_HOOK_SIG_NONE: + break; + + case PL_HOOK_SIG_TEX: + if (!resizable) { + if (res.tex->params.w != img->w || + res.tex->params.h != img->h || + !pl_rect2d_eq(res.rect, img->rect)) + { + PL_ERR(rr, "User hook tried resizing non-resizable stage!"); + goto hook_error; + } + } + + *img = (struct img) { + .tex = res.tex, + .repr = res.repr, + .color = res.color, + .comps = res.components, + .rect = res.rect, + .w = res.tex->params.w, + .h = res.tex->params.h, + .unique = img->unique, + }; + break; + + case PL_HOOK_SIG_COLOR: + if (!resizable) { + if (res.sh->output_w != img->w 
|| + res.sh->output_h != img->h || + !pl_rect2d_eq(res.rect, img->rect)) + { + PL_ERR(rr, "User hook tried resizing non-resizable stage!"); + goto hook_error; + } + } + + *img = (struct img) { + .sh = res.sh, + .repr = res.repr, + .color = res.color, + .comps = res.components, + .rect = res.rect, + .w = res.sh->output_w, + .h = res.sh->output_h, + .unique = img->unique, + .err_enum = PL_RENDER_ERR_HOOKS, + .err_msg = "Failed applying user hook", + .err_tex = hparams.tex, // if any + }; + break; + + case PL_HOOK_SIG_COUNT: + pl_unreachable(); + } + + // a hook was performed successfully + ret = true; + +hook_skip: + continue; +hook_error: + PL_ARRAY_APPEND(rr, rr->disabled_hooks, hook->signature); + rr->errors |= PL_RENDER_ERR_HOOKS; + } + + // Make sure the state remains as valid as possible, even if the resulting + // shaders might end up nonsensical, to prevent segfaults + if (!img->tex && !img->sh) + img->sh = pl_dispatch_begin(rr->dp); + return ret; +} + +static void hdr_update_peak(struct pass_state *pass) +{ + const struct pl_render_params *params = pass->params; + pl_renderer rr = pass->rr; + if (!params->peak_detect_params || !pl_color_space_is_hdr(&pass->img.color)) + goto cleanup; + + if (rr->errors & PL_RENDER_ERR_PEAK_DETECT) + goto cleanup; + + if (pass->fbofmt[4] && !(pass->fbofmt[4]->caps & PL_FMT_CAP_STORABLE)) + goto cleanup; + + if (!rr->gpu->limits.max_ssbo_size) + goto cleanup; + + float max_peak = pl_color_transfer_nominal_peak(pass->img.color.transfer) * + PL_COLOR_SDR_WHITE; + if (pass->img.color.transfer == PL_COLOR_TRC_HLG) + max_peak = pass->img.color.hdr.max_luma; + if (max_peak <= pass->target.color.hdr.max_luma + 1e-6) + goto cleanup; // no adaptation needed + + if (pass->img.color.hdr.avg_pq_y) + goto cleanup; // DV metadata already present + + enum pl_hdr_metadata_type metadata = PL_HDR_METADATA_ANY; + if (params->color_map_params) + metadata = params->color_map_params->metadata; + + if (metadata && metadata != PL_HDR_METADATA_CIE_Y) + goto cleanup; // metadata will be unused + + const struct pl_color_map_params *cpars = params->color_map_params; + bool uses_ootf = cpars && cpars->tone_mapping_function == &pl_tone_map_st2094_40; + if (uses_ootf && pass->img.color.hdr.ootf.num_anchors) + goto cleanup; // HDR10+ OOTF is being used + + if (params->lut && params->lut_type == PL_LUT_CONVERSION) + goto cleanup; // LUT handles tone mapping + + if (!pass->fbofmt[4] && !params->peak_detect_params->allow_delayed) { + PL_WARN(rr, "Disabling peak detection because " + "`pl_peak_detect_params.allow_delayed` is false, but lack of " + "FBOs forces the result to be delayed."); + rr->errors |= PL_RENDER_ERR_PEAK_DETECT; + goto cleanup; + } + + bool ok = pl_shader_detect_peak(img_sh(pass, &pass->img), pass->img.color, + &rr->tone_map_state, params->peak_detect_params); + if (!ok) { + PL_WARN(rr, "Failed creating HDR peak detection shader.. 
disabling"); + rr->errors |= PL_RENDER_ERR_PEAK_DETECT; + goto cleanup; + } + + pass->need_peak_fbo = !params->peak_detect_params->allow_delayed; + return; + +cleanup: + // No peak detection required or supported, so clean up the state to avoid + // confusing it with later frames where peak detection is enabled again + pl_reset_detected_peak(rr->tone_map_state); +} + +bool pl_renderer_get_hdr_metadata(pl_renderer rr, + struct pl_hdr_metadata *metadata) +{ + return pl_get_detected_hdr_metadata(rr->tone_map_state, metadata); +} + +struct plane_state { + enum plane_type type; + struct pl_plane plane; + struct img img; // for per-plane shaders + float plane_w, plane_h; // logical plane dimensions +}; + +static const char *plane_type_names[] = { + [PLANE_INVALID] = "invalid", + [PLANE_ALPHA] = "alpha", + [PLANE_CHROMA] = "chroma", + [PLANE_LUMA] = "luma", + [PLANE_RGB] = "rgb", + [PLANE_XYZ] = "xyz", +}; + +static void log_plane_info(pl_renderer rr, const struct plane_state *st) +{ + const struct pl_plane *plane = &st->plane; + PL_TRACE(rr, " Type: %s", plane_type_names[st->type]); + + switch (plane->components) { + case 0: + PL_TRACE(rr, " Components: (none)"); + break; + case 1: + PL_TRACE(rr, " Components: {%d}", + plane->component_mapping[0]); + break; + case 2: + PL_TRACE(rr, " Components: {%d %d}", + plane->component_mapping[0], + plane->component_mapping[1]); + break; + case 3: + PL_TRACE(rr, " Components: {%d %d %d}", + plane->component_mapping[0], + plane->component_mapping[1], + plane->component_mapping[2]); + break; + case 4: + PL_TRACE(rr, " Components: {%d %d %d %d}", + plane->component_mapping[0], + plane->component_mapping[1], + plane->component_mapping[2], + plane->component_mapping[3]); + break; + } + + PL_TRACE(rr, " Rect: {%f %f} -> {%f %f}", + st->img.rect.x0, st->img.rect.y0, st->img.rect.x1, st->img.rect.y1); + + PL_TRACE(rr, " Bits: %d (used) / %d (sampled), shift %d", + st->img.repr.bits.color_depth, + st->img.repr.bits.sample_depth, + st->img.repr.bits.bit_shift); +} + +// Returns true if debanding was applied +static bool plane_deband(struct pass_state *pass, struct img *img, float neutral[3]) +{ + const struct pl_render_params *params = pass->params; + const struct pl_frame *image = &pass->image; + pl_renderer rr = pass->rr; + if ((rr->errors & PL_RENDER_ERR_DEBANDING) || + !params->deband_params || !pass->fbofmt[4]) + { + return false; + } + + struct pl_color_repr repr = img->repr; + struct pl_sample_src src = { + .tex = img_tex(pass, img), + .components = img->comps, + .scale = pl_color_repr_normalize(&repr), + }; + + if (!(src.tex->params.format->caps & PL_FMT_CAP_LINEAR)) { + PL_WARN(rr, "Debanding requires uploaded textures to be linearly " + "sampleable (params.sample_mode = PL_TEX_SAMPLE_LINEAR)! " + "Disabling debanding.."); + rr->errors |= PL_RENDER_ERR_DEBANDING; + return false; + } + + // Divide the deband grain scale by the effective current colorspace nominal + // peak, to make sure the output intensity of the grain is as independent + // of the source as possible, even though it happens this early in the + // process (well before any linearization / output adaptation) + struct pl_deband_params dparams = *params->deband_params; + dparams.grain /= image->color.hdr.max_luma / PL_COLOR_SDR_WHITE; + memcpy(dparams.grain_neutral, neutral, sizeof(dparams.grain_neutral)); + + img->tex = NULL; + img->sh = pl_dispatch_begin_ex(rr->dp, true); + pl_shader_deband(img->sh, &src, &dparams); + img->err_msg = "Failed applying debanding... 
disabling!"; + img->err_enum = PL_RENDER_ERR_DEBANDING; + img->err_tex = src.tex; + img->repr = repr; + return true; +} + +// Returns true if grain was applied +static bool plane_film_grain(struct pass_state *pass, int plane_idx, + struct plane_state *st, + const struct plane_state *ref) +{ + const struct pl_frame *image = &pass->image; + pl_renderer rr = pass->rr; + if (rr->errors & PL_RENDER_ERR_FILM_GRAIN) + return false; + + struct img *img = &st->img; + struct pl_plane *plane = &st->plane; + struct pl_color_repr repr = image->repr; + bool is_orig_repr = pl_color_repr_equal(&st->img.repr, &image->repr); + if (!is_orig_repr) { + // Propagate the original color depth to the film grain algorithm, but + // update the sample depth and effective bit shift based on the state + // of the current texture, which is guaranteed to already be + // normalized. + pl_assert(st->img.repr.bits.bit_shift == 0); + repr.bits.sample_depth = st->img.repr.bits.sample_depth; + repr.bits.bit_shift = repr.bits.sample_depth - repr.bits.color_depth; + } + + struct pl_film_grain_params grain_params = { + .data = image->film_grain, + .luma_tex = ref->plane.texture, + .repr = &repr, + .components = plane->components, + }; + + switch (image->film_grain.type) { + case PL_FILM_GRAIN_NONE: return false; + case PL_FILM_GRAIN_H274: break; + case PL_FILM_GRAIN_AV1: + grain_params.luma_tex = ref->plane.texture; + for (int c = 0; c < ref->plane.components; c++) { + if (ref->plane.component_mapping[c] == PL_CHANNEL_Y) + grain_params.luma_comp = c; + } + break; + default: pl_unreachable(); + } + + for (int c = 0; c < plane->components; c++) + grain_params.component_mapping[c] = plane->component_mapping[c]; + + if (!pl_needs_film_grain(&grain_params)) + return false; + + if (!pass->fbofmt[plane->components]) { + PL_ERR(rr, "Film grain required but no renderable format available.. " + "disabling!"); + rr->errors |= PL_RENDER_ERR_FILM_GRAIN; + return false; + } + + grain_params.tex = img_tex(pass, img); + if (!grain_params.tex) + return false; + + img->sh = pl_dispatch_begin_ex(rr->dp, true); + if (!pl_shader_film_grain(img->sh, &rr->grain_state[plane_idx], &grain_params)) { + pl_dispatch_abort(rr->dp, &img->sh); + rr->errors |= PL_RENDER_ERR_FILM_GRAIN; + return false; + } + + img->tex = NULL; + img->err_msg = "Failed applying film grain.. 
disabling!"; + img->err_enum = PL_RENDER_ERR_FILM_GRAIN; + img->err_tex = grain_params.tex; + if (is_orig_repr) + img->repr = repr; + return true; +} + +static const enum pl_hook_stage plane_hook_stages[] = { + [PLANE_ALPHA] = PL_HOOK_ALPHA_INPUT, + [PLANE_CHROMA] = PL_HOOK_CHROMA_INPUT, + [PLANE_LUMA] = PL_HOOK_LUMA_INPUT, + [PLANE_RGB] = PL_HOOK_RGB_INPUT, + [PLANE_XYZ] = PL_HOOK_XYZ_INPUT, +}; + +static const enum pl_hook_stage plane_scaled_hook_stages[] = { + [PLANE_ALPHA] = PL_HOOK_ALPHA_SCALED, + [PLANE_CHROMA] = PL_HOOK_CHROMA_SCALED, + [PLANE_LUMA] = 0, // never hooked + [PLANE_RGB] = 0, + [PLANE_XYZ] = 0, +}; + +static enum pl_lut_type guess_frame_lut_type(const struct pl_frame *frame, + bool reversed) +{ + if (!frame->lut) + return PL_LUT_UNKNOWN; + if (frame->lut_type) + return frame->lut_type; + + enum pl_color_system sys_in = frame->lut->repr_in.sys; + enum pl_color_system sys_out = frame->lut->repr_out.sys; + if (reversed) + PL_SWAP(sys_in, sys_out); + + if (sys_in == PL_COLOR_SYSTEM_RGB && sys_out == sys_in) + return PL_LUT_NORMALIZED; + + if (sys_in == frame->repr.sys && sys_out == PL_COLOR_SYSTEM_RGB) + return PL_LUT_CONVERSION; + + // Unknown, just fall back to the default + return PL_LUT_NATIVE; +} + +static pl_fmt merge_fmt(struct pass_state *pass, const struct img *a, + const struct img *b) +{ + pl_renderer rr = pass->rr; + pl_fmt fmta = a->tex ? a->tex->params.format : PL_DEF(a->fmt, pass->fbofmt[a->comps]); + pl_fmt fmtb = b->tex ? b->tex->params.format : PL_DEF(b->fmt, pass->fbofmt[b->comps]); + pl_assert(fmta && fmtb); + if (fmta->type != fmtb->type) + return NULL; + + int num_comps = PL_MIN(4, a->comps + b->comps); + int min_depth = PL_MAX(a->repr.bits.sample_depth, b->repr.bits.sample_depth); + + // Only return formats that support all relevant caps of both formats + const enum pl_fmt_caps mask = PL_FMT_CAP_SAMPLEABLE | PL_FMT_CAP_LINEAR; + enum pl_fmt_caps req_caps = (fmta->caps & mask) | (fmtb->caps & mask); + + return pl_find_fmt(rr->gpu, fmta->type, num_comps, min_depth, 0, req_caps); +} + +// Applies a series of rough heuristics to figure out whether we expect any +// performance gains from plane merging. 
This is basically a series of checks +// for operations that we *know* benefit from merged planes +static bool want_merge(struct pass_state *pass, + const struct plane_state *st, + const struct plane_state *ref) +{ + const struct pl_render_params *params = pass->params; + const pl_renderer rr = pass->rr; + if (!pass->fbofmt[4]) + return false; + + // Debanding + if (!(rr->errors & PL_RENDER_ERR_DEBANDING) && params->deband_params) + return true; + + // Other plane hooks, which are generally nontrivial + enum pl_hook_stage stage = plane_hook_stages[st->type]; + for (int i = 0; i < params->num_hooks; i++) { + if (params->hooks[i]->stages & stage) + return true; + } + + // Non-trivial scaling + struct pl_sample_src src = { + .new_w = ref->img.w, + .new_h = ref->img.h, + .rect = { + .x1 = st->img.w, + .y1 = st->img.h, + }, + }; + + struct sampler_info info = sample_src_info(pass, &src, SAMPLER_PLANE); + if (info.type == SAMPLER_COMPLEX) + return true; + + // Film grain synthesis, can be merged for compatible channels, saving on + // redundant sampling of the grain/offset textures + struct pl_film_grain_params grain_params = { + .data = pass->image.film_grain, + .repr = (struct pl_color_repr *) &st->img.repr, + .components = st->plane.components, + }; + + for (int c = 0; c < st->plane.components; c++) + grain_params.component_mapping[c] = st->plane.component_mapping[c]; + + if (!(rr->errors & PL_RENDER_ERR_FILM_GRAIN) && + pl_needs_film_grain(&grain_params)) + { + return true; + } + + return false; +} + +// This scales and merges all of the source images, and initializes pass->img. +static bool pass_read_image(struct pass_state *pass) +{ + const struct pl_render_params *params = pass->params; + struct pl_frame *image = &pass->image; + pl_renderer rr = pass->rr; + + struct plane_state planes[4]; + struct plane_state *ref = &planes[pass->src_ref]; + pl_assert(pass->src_ref >= 0 && pass->src_ref < image->num_planes); + + for (int i = 0; i < image->num_planes; i++) { + planes[i] = (struct plane_state) { + .type = detect_plane_type(&image->planes[i], &image->repr), + .plane = image->planes[i], + .img = { + .w = image->planes[i].texture->params.w, + .h = image->planes[i].texture->params.h, + .tex = image->planes[i].texture, + .repr = image->repr, + .color = image->color, + .comps = image->planes[i].components, + }, + }; + + // Deinterlace plane if needed + if (image->field != PL_FIELD_NONE && params->deinterlace_params && + pass->fbofmt[4] && !(rr->errors & PL_RENDER_ERR_DEINTERLACING)) + { + struct img *img = &planes[i].img; + struct pl_deinterlace_source src = { + .cur.top = img->tex, + .prev.top = image->prev ? image->prev->planes[i].texture : NULL, + .next.top = image->next ? image->next->planes[i].texture : NULL, + .field = image->field, + .first_field = image->first_field, + .component_mask = (1 << img->comps) - 1, + }; + + img->tex = NULL; + img->sh = pl_dispatch_begin_ex(pass->rr->dp, true); + pl_shader_deinterlace(img->sh, &src, params->deinterlace_params); + img->err_msg = "Failed deinterlacing plane.. 
disabling!"; + img->err_enum = PL_RENDER_ERR_DEINTERLACING; + img->err_tex = planes[i].plane.texture; + } + } + + // Original ref texture, even after preprocessing + pl_tex ref_tex = ref->plane.texture; + + // Merge all compatible planes into 'combined' shaders + for (int i = 0; i < image->num_planes; i++) { + struct plane_state *sti = &planes[i]; + if (!sti->type) + continue; + if (!want_merge(pass, sti, ref)) + continue; + + bool did_merge = false; + for (int j = i+1; j < image->num_planes; j++) { + struct plane_state *stj = &planes[j]; + bool merge = sti->type == stj->type && + sti->img.w == stj->img.w && + sti->img.h == stj->img.h && + sti->plane.shift_x == stj->plane.shift_x && + sti->plane.shift_y == stj->plane.shift_y; + if (!merge) + continue; + + pl_fmt fmt = merge_fmt(pass, &sti->img, &stj->img); + if (!fmt) + continue; + + PL_TRACE(rr, "Merging plane %d into plane %d", j, i); + pl_shader sh = sti->img.sh; + if (!sh) { + sh = sti->img.sh = pl_dispatch_begin_ex(pass->rr->dp, true); + pl_shader_sample_direct(sh, pl_sample_src( .tex = sti->img.tex )); + sti->img.tex = NULL; + } + + pl_shader psh = NULL; + if (!stj->img.sh) { + psh = pl_dispatch_begin_ex(pass->rr->dp, true); + pl_shader_sample_direct(psh, pl_sample_src( .tex = stj->img.tex )); + } + + ident_t sub = sh_subpass(sh, psh ? psh : stj->img.sh); + pl_dispatch_abort(rr->dp, &psh); + if (!sub) + break; // skip merging + + sh_describe(sh, "merging planes"); + GLSL("{ \n" + "vec4 tmp = "$"(); \n", sub); + for (int jc = 0; jc < stj->img.comps; jc++) { + int map = stj->plane.component_mapping[jc]; + if (map == PL_CHANNEL_NONE) + continue; + int ic = sti->img.comps++; + pl_assert(ic < 4); + GLSL("color[%d] = tmp[%d]; \n", ic, jc); + sti->plane.components = sti->img.comps; + sti->plane.component_mapping[ic] = map; + } + GLSL("} \n"); + + sti->img.fmt = fmt; + pl_dispatch_abort(rr->dp, &stj->img.sh); + *stj = (struct plane_state) {0}; + did_merge = true; + } + + if (!did_merge) + continue; + + if (!img_tex(pass, &sti->img)) { + PL_ERR(rr, "Failed dispatching plane merging shader, disabling FBOs!"); + memset(pass->fbofmt, 0, sizeof(pass->fbofmt)); + rr->errors |= PL_RENDER_ERR_FBO; + return false; + } + } + + int bits = image->repr.bits.sample_depth; + float out_scale = bits ? (1llu << bits) / ((1llu << bits) - 1.0f) : 1.0f; + float neutral_luma = 0.0, neutral_chroma = 0.5f * out_scale; + if (pl_color_levels_guess(&image->repr) == PL_COLOR_LEVELS_LIMITED) + neutral_luma = 16 / 256.0f * out_scale; + if (!pl_color_system_is_ycbcr_like(image->repr.sys)) + neutral_chroma = neutral_luma; + + // Compute the sampling rc of each plane + for (int i = 0; i < image->num_planes; i++) { + struct plane_state *st = &planes[i]; + if (!st->type) + continue; + + float rx = (float) st->plane.texture->params.w / ref_tex->params.w, + ry = (float) st->plane.texture->params.h / ref_tex->params.h; + + // Only accept integer scaling ratios. This accounts for the fact that + // fractionally subsampled planes get rounded up to the nearest integer + // size, which we want to discard. + float rrx = rx >= 1 ? roundf(rx) : 1.0 / roundf(1.0 / rx), + rry = ry >= 1 ? 
roundf(ry) : 1.0 / roundf(1.0 / ry); + + float sx = st->plane.shift_x, + sy = st->plane.shift_y; + + st->img.rect = (pl_rect2df) { + .x0 = (image->crop.x0 - sx) * rrx, + .y0 = (image->crop.y0 - sy) * rry, + .x1 = (image->crop.x1 - sx) * rrx, + .y1 = (image->crop.y1 - sy) * rry, + }; + + st->plane_w = ref_tex->params.w * rrx; + st->plane_h = ref_tex->params.h * rry; + + PL_TRACE(rr, "Plane %d:", i); + log_plane_info(rr, st); + + float neutral[3] = {0.0}; + for (int c = 0, idx = 0; c < st->plane.components; c++) { + switch (st->plane.component_mapping[c]) { + case PL_CHANNEL_Y: neutral[idx++] = neutral_luma; break; + case PL_CHANNEL_U: // fall through + case PL_CHANNEL_V: neutral[idx++] = neutral_chroma; break; + } + } + + // The order of operations (deband -> film grain -> user hooks) is + // chosen to maximize quality. Note that film grain requires unmodified + // plane sizes, so it has to be before user hooks. As for debanding, + // it's reduced in quality after e.g. plane scalers as well. It's also + // made less effective by performing film grain synthesis first. + + if (plane_deband(pass, &st->img, neutral)) { + PL_TRACE(rr, "After debanding:"); + log_plane_info(rr, st); + } + + if (plane_film_grain(pass, i, st, ref)) { + PL_TRACE(rr, "After film grain:"); + log_plane_info(rr, st); + } + + if (pass_hook(pass, &st->img, plane_hook_stages[st->type])) { + PL_TRACE(rr, "After user hooks:"); + log_plane_info(rr, st); + } + } + + pl_shader sh = pl_dispatch_begin_ex(rr->dp, true); + sh_require(sh, PL_SHADER_SIG_NONE, 0, 0); + + // Initialize the color to black + GLSL("vec4 color = vec4("$", vec2("$"), 1.0); \n" + "// pass_read_image \n" + "{ \n" + "vec4 tmp; \n", + SH_FLOAT(neutral_luma), SH_FLOAT(neutral_chroma)); + + // For quality reasons, explicitly drop subpixel offsets from the ref rect + // and re-add them as part of `pass->img.rect`, always rounding towards 0. + // Additionally, drop anamorphic subpixel mismatches. 
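+    // For illustration (hypothetical numbers): a reference crop of
+    // {12.3, 7.8, 1932.3, 1087.8} has size 1920.0 x 1080.0 and rounds to
+    // {12, 7, 1932, 1087}; the dropped offsets (0.3, 0.8) are re-applied
+    // below via `off_x`/`off_y`, while `stretch_x`/`stretch_y` remain 1.0
+    // because the rounded size matches the fractional size exactly.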
+ pl_rect2d ref_rounded; + ref_rounded.x0 = truncf(ref->img.rect.x0); + ref_rounded.y0 = truncf(ref->img.rect.y0); + ref_rounded.x1 = ref_rounded.x0 + roundf(pl_rect_w(ref->img.rect)); + ref_rounded.y1 = ref_rounded.y0 + roundf(pl_rect_h(ref->img.rect)); + + PL_TRACE(rr, "Rounded reference rect: {%d %d %d %d}", + ref_rounded.x0, ref_rounded.y0, + ref_rounded.x1, ref_rounded.y1); + + float off_x = ref->img.rect.x0 - ref_rounded.x0, + off_y = ref->img.rect.y0 - ref_rounded.y0, + stretch_x = pl_rect_w(ref_rounded) / pl_rect_w(ref->img.rect), + stretch_y = pl_rect_h(ref_rounded) / pl_rect_h(ref->img.rect); + + for (int i = 0; i < image->num_planes; i++) { + struct plane_state *st = &planes[i]; + const struct pl_plane *plane = &st->plane; + if (!st->type) + continue; + + float scale_x = pl_rect_w(st->img.rect) / pl_rect_w(ref->img.rect), + scale_y = pl_rect_h(st->img.rect) / pl_rect_h(ref->img.rect), + base_x = st->img.rect.x0 - scale_x * off_x, + base_y = st->img.rect.y0 - scale_y * off_y; + + struct pl_sample_src src = { + .components = plane->components, + .address_mode = plane->address_mode, + .scale = pl_color_repr_normalize(&st->img.repr), + .new_w = pl_rect_w(ref_rounded), + .new_h = pl_rect_h(ref_rounded), + .rect = { + base_x, + base_y, + base_x + stretch_x * pl_rect_w(st->img.rect), + base_y + stretch_y * pl_rect_h(st->img.rect), + }, + }; + + if (plane->flipped) { + src.rect.y0 = st->plane_h - src.rect.y0; + src.rect.y1 = st->plane_h - src.rect.y1; + } + + PL_TRACE(rr, "Aligning plane %d: {%f %f %f %f} -> {%f %f %f %f}%s", + i, st->img.rect.x0, st->img.rect.y0, + st->img.rect.x1, st->img.rect.y1, + src.rect.x0, src.rect.y0, + src.rect.x1, src.rect.y1, + plane->flipped ? " (flipped) " : ""); + + st->img.unique = true; + pl_rect2d unscaled = { .x1 = src.new_w, .y1 = src.new_h }; + if (st->img.sh && st->img.w == src.new_w && st->img.h == src.new_h && + pl_rect2d_eq(src.rect, unscaled)) + { + // Image rects are already equal, no indirect scaling needed + } else { + src.tex = img_tex(pass, &st->img); + st->img.tex = NULL; + st->img.sh = pl_dispatch_begin_ex(rr->dp, true); + dispatch_sampler(pass, st->img.sh, &rr->samplers_src[i], + SAMPLER_PLANE, NULL, &src); + st->img.err_enum |= PL_RENDER_ERR_SAMPLING; + st->img.rect.x0 = st->img.rect.y0 = 0.0f; + st->img.w = st->img.rect.x1 = src.new_w; + st->img.h = st->img.rect.y1 = src.new_h; + } + + pass_hook(pass, &st->img, plane_scaled_hook_stages[st->type]); + ident_t sub = sh_subpass(sh, img_sh(pass, &st->img)); + if (!sub) { + if (!img_tex(pass, &st->img)) { + pl_dispatch_abort(rr->dp, &sh); + return false; + } + + sub = sh_subpass(sh, img_sh(pass, &st->img)); + pl_assert(sub); + } + + GLSL("tmp = "$"(); \n", sub); + for (int c = 0; c < src.components; c++) { + if (plane->component_mapping[c] < 0) + continue; + GLSL("color[%d] = tmp[%d];\n", plane->component_mapping[c], c); + } + + // we don't need it anymore + pl_dispatch_abort(rr->dp, &st->img.sh); + } + + GLSL("}\n"); + + pass->img = (struct img) { + .sh = sh, + .w = pl_rect_w(ref_rounded), + .h = pl_rect_h(ref_rounded), + .repr = ref->img.repr, + .color = image->color, + .comps = ref->img.repr.alpha ? 
4 : 3, + .rect = { + off_x, + off_y, + off_x + pl_rect_w(ref->img.rect), + off_y + pl_rect_h(ref->img.rect), + }, + }; + + // Update the reference rect to our adjusted image coordinates + pass->ref_rect = pass->img.rect; + + pass_hook(pass, &pass->img, PL_HOOK_NATIVE); + + // Apply LUT logic and colorspace conversion + enum pl_lut_type lut_type = guess_frame_lut_type(image, false); + sh = img_sh(pass, &pass->img); + bool needs_conversion = true; + + if (lut_type == PL_LUT_NATIVE || lut_type == PL_LUT_CONVERSION) { + // Fix bit depth normalization before applying LUT + float scale = pl_color_repr_normalize(&pass->img.repr); + GLSL("color *= vec4("$"); \n", SH_FLOAT(scale)); + pl_shader_set_alpha(sh, &pass->img.repr, PL_ALPHA_INDEPENDENT); + pl_shader_custom_lut(sh, image->lut, &rr->lut_state[LUT_IMAGE]); + + if (lut_type == PL_LUT_CONVERSION) { + pass->img.repr.sys = PL_COLOR_SYSTEM_RGB; + pass->img.repr.levels = PL_COLOR_LEVELS_FULL; + needs_conversion = false; + } + } + + if (needs_conversion) { + if (pass->img.repr.sys == PL_COLOR_SYSTEM_XYZ) + pass->img.color.transfer = PL_COLOR_TRC_LINEAR; + pl_shader_decode_color(sh, &pass->img.repr, params->color_adjustment); + } + + if (lut_type == PL_LUT_NORMALIZED) + pl_shader_custom_lut(sh, image->lut, &rr->lut_state[LUT_IMAGE]); + + // A main PL_LUT_CONVERSION LUT overrides ICC profiles + bool main_lut_override = params->lut && params->lut_type == PL_LUT_CONVERSION; + if (image->icc && !main_lut_override) { + pl_shader_set_alpha(sh, &pass->img.repr, PL_ALPHA_INDEPENDENT); + pl_icc_decode(sh, image->icc, &rr->icc_state[ICC_IMAGE], &pass->img.color); + } + + // Pre-multiply alpha channel before the rest of the pipeline, to avoid + // bleeding colors from transparent regions into non-transparent regions + pl_shader_set_alpha(sh, &pass->img.repr, PL_ALPHA_PREMULTIPLIED); + + pass_hook(pass, &pass->img, PL_HOOK_RGB); + sh = NULL; + return true; +} + +static bool pass_scale_main(struct pass_state *pass) +{ + const struct pl_render_params *params = pass->params; + pl_renderer rr = pass->rr; + + pl_fmt fbofmt = pass->fbofmt[pass->img.comps]; + if (!fbofmt) { + PL_TRACE(rr, "Skipping main scaler (no FBOs)"); + return true; + } + + const pl_rect2df new_rect = { + .x1 = abs(pl_rect_w(pass->dst_rect)), + .y1 = abs(pl_rect_h(pass->dst_rect)), + }; + + struct img *img = &pass->img; + struct pl_sample_src src = { + .components = img->comps, + .new_w = pl_rect_w(new_rect), + .new_h = pl_rect_h(new_rect), + .rect = img->rect, + }; + + const struct pl_frame *image = &pass->image; + bool need_fbo = false; + + // Force FBO indirection if this shader is non-resizable + int out_w, out_h; + if (img->sh && pl_shader_output_size(img->sh, &out_w, &out_h)) + need_fbo |= out_w != src.new_w || out_h != src.new_h; + + struct sampler_info info = sample_src_info(pass, &src, SAMPLER_MAIN); + bool use_sigmoid = info.dir == SAMPLER_UP && params->sigmoid_params; + bool use_linear = info.dir == SAMPLER_DOWN; + + // Opportunistically update peak here if it would save performance + if (info.dir == SAMPLER_UP) + hdr_update_peak(pass); + + // We need to enable the full rendering pipeline if there are any user + // shaders / hooks that might depend on it. 
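+    // For example (illustrative only): a user hook with `stages =
+    // PL_HOOK_LINEAR` forces both an FBO indirection and linear-light
+    // scaling below, and a PL_HOOK_SIGMOID hook additionally enables
+    // sigmoidization, even if the scaler alone would not require either.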
+ uint64_t scaling_hooks = PL_HOOK_PRE_KERNEL | PL_HOOK_POST_KERNEL; + uint64_t linear_hooks = PL_HOOK_LINEAR | PL_HOOK_SIGMOID; + + for (int i = 0; i < params->num_hooks; i++) { + if (params->hooks[i]->stages & (scaling_hooks | linear_hooks)) { + need_fbo = true; + if (params->hooks[i]->stages & linear_hooks) + use_linear = true; + if (params->hooks[i]->stages & PL_HOOK_SIGMOID) + use_sigmoid = true; + } + } + + if (info.dir == SAMPLER_NOOP && !need_fbo) { + pl_assert(src.new_w == img->w && src.new_h == img->h); + PL_TRACE(rr, "Skipping main scaler (would be no-op)"); + goto done; + } + + if (info.type == SAMPLER_DIRECT && !need_fbo) { + img->w = src.new_w; + img->h = src.new_h; + img->rect = new_rect; + PL_TRACE(rr, "Skipping main scaler (free sampling)"); + goto done; + } + + // Hard-disable both sigmoidization and linearization when required + if (params->disable_linear_scaling || fbofmt->component_depth[0] < 16) + use_sigmoid = use_linear = false; + + // Avoid sigmoidization for HDR content because it clips to [0,1], and + // linearization because it causes very nasty ringing artefacts. + if (pl_color_space_is_hdr(&img->color)) + use_sigmoid = use_linear = false; + + if (!(use_linear || use_sigmoid) && img->color.transfer == PL_COLOR_TRC_LINEAR) { + img->color.transfer = image->color.transfer; + if (image->color.transfer == PL_COLOR_TRC_LINEAR) + img->color.transfer = PL_COLOR_TRC_GAMMA22; // arbitrary fallback + pl_shader_delinearize(img_sh(pass, img), &img->color); + } + + if (use_linear || use_sigmoid) { + pl_shader_linearize(img_sh(pass, img), &img->color); + img->color.transfer = PL_COLOR_TRC_LINEAR; + pass_hook(pass, img, PL_HOOK_LINEAR); + } + + if (use_sigmoid) { + pl_shader_sigmoidize(img_sh(pass, img), params->sigmoid_params); + pass_hook(pass, img, PL_HOOK_SIGMOID); + } + + pass_hook(pass, img, PL_HOOK_PRE_KERNEL); + + src.tex = img_tex(pass, img); + if (!src.tex) + return false; + pass->need_peak_fbo = false; + + pl_shader sh = pl_dispatch_begin_ex(rr->dp, true); + dispatch_sampler(pass, sh, &rr->sampler_main, SAMPLER_MAIN, NULL, &src); + img->tex = NULL; + img->sh = sh; + img->w = src.new_w; + img->h = src.new_h; + img->rect = new_rect; + + pass_hook(pass, img, PL_HOOK_POST_KERNEL); + + if (use_sigmoid) + pl_shader_unsigmoidize(img_sh(pass, img), params->sigmoid_params); + +done: + if (info.dir != SAMPLER_UP) + hdr_update_peak(pass); + pass_hook(pass, img, PL_HOOK_SCALED); + return true; +} + +static pl_tex get_feature_map(struct pass_state *pass) +{ + const struct pl_render_params *params = pass->params; + pl_renderer rr = pass->rr; + const struct pl_color_map_params *cparams = params->color_map_params; + cparams = PL_DEF(cparams, &pl_color_map_default_params); + if (!cparams->contrast_recovery || cparams->contrast_smoothness <= 1) + return NULL; + if (!pass->fbofmt[4]) + return NULL; + if (!pl_color_space_is_hdr(&pass->img.color)) + return NULL; + if (rr->errors & (PL_RENDER_ERR_SAMPLING | PL_RENDER_ERR_CONTRAST_RECOVERY)) + return NULL; + if (pass->img.color.hdr.max_luma <= pass->target.color.hdr.max_luma + 1e-6) + return NULL; // no adaptation needed + if (params->lut && params->lut_type == PL_LUT_CONVERSION) + return NULL; // LUT handles tone mapping + + struct img *img = &pass->img; + if (!img_tex(pass, img)) + return NULL; + + const float ratio = cparams->contrast_smoothness; + const int cr_w = ceilf(abs(pl_rect_w(pass->dst_rect)) / ratio); + const int cr_h = ceilf(abs(pl_rect_h(pass->dst_rect)) / ratio); + pl_tex inter_tex = get_fbo(pass, img->w, img->h, NULL, 1, 
PL_DEBUG_TAG); + pl_tex out_tex = get_fbo(pass, cr_w, cr_h, NULL, 1, PL_DEBUG_TAG); + if (!inter_tex || !out_tex) + goto error; + + pl_shader sh = pl_dispatch_begin(rr->dp); + pl_shader_sample_direct(sh, pl_sample_src( .tex = img->tex )); + pl_shader_extract_features(sh, img->color); + bool ok = pl_dispatch_finish(rr->dp, pl_dispatch_params( + .shader = &sh, + .target = inter_tex, + )); + if (!ok) + goto error; + + const struct pl_sample_src src = { + .tex = inter_tex, + .rect = img->rect, + .address_mode = PL_TEX_ADDRESS_MIRROR, + .components = 1, + .new_w = cr_w, + .new_h = cr_h, + }; + + sh = pl_dispatch_begin(rr->dp); + dispatch_sampler(pass, sh, &rr->sampler_contrast, SAMPLER_CONTRAST, out_tex, &src); + ok = pl_dispatch_finish(rr->dp, pl_dispatch_params( + .shader = &sh, + .target = out_tex, + )); + if (!ok) + goto error; + + return out_tex; + +error: + PL_ERR(rr, "Failed extracting luma for contrast recovery, disabling"); + rr->errors |= PL_RENDER_ERR_CONTRAST_RECOVERY; + return NULL; +} + +// Transforms image into the output color space (tone-mapping, ICC 3DLUT, etc) +static void pass_convert_colors(struct pass_state *pass) +{ + const struct pl_render_params *params = pass->params; + const struct pl_frame *image = &pass->image; + const struct pl_frame *target = &pass->target; + pl_renderer rr = pass->rr; + + struct img *img = &pass->img; + pl_shader sh = img_sh(pass, img); + + bool prelinearized = false; + bool need_conversion = true; + assert(image->color.primaries == img->color.primaries); + if (img->color.transfer == PL_COLOR_TRC_LINEAR) { + if (img->repr.alpha == PL_ALPHA_PREMULTIPLIED) { + // Very annoying edge case: since prelinerization happens with + // premultiplied alpha, but color mapping happens with independent + // alpha, we need to go back to non-linear representation *before* + // alpha mode conversion, to avoid distortion + img->color.transfer = image->color.transfer; + pl_shader_delinearize(sh, &img->color); + } else { + prelinearized = true; + } + } else if (img->color.transfer != image->color.transfer) { + if (image->color.transfer == PL_COLOR_TRC_LINEAR) { + // Another annoying edge case: if the input is linear light, but we + // decide to un-linearize it for scaling purposes, we need to + // re-linearize before passing it into `pl_shader_color_map` + pl_shader_linearize(sh, &img->color); + img->color.transfer = PL_COLOR_TRC_LINEAR; + } + } + + // Do all processing in independent alpha, to avoid nonlinear distortions + pl_shader_set_alpha(sh, &img->repr, PL_ALPHA_INDEPENDENT); + + // Apply color blindness simulation if requested + if (params->cone_params) + pl_shader_cone_distort(sh, img->color, params->cone_params); + + if (params->lut) { + struct pl_color_space lut_in = params->lut->color_in; + struct pl_color_space lut_out = params->lut->color_out; + switch (params->lut_type) { + case PL_LUT_UNKNOWN: + case PL_LUT_NATIVE: + pl_color_space_merge(&lut_in, &image->color); + pl_color_space_merge(&lut_out, &image->color); + break; + case PL_LUT_CONVERSION: + pl_color_space_merge(&lut_in, &image->color); + need_conversion = false; // conversion LUT the highest priority + break; + case PL_LUT_NORMALIZED: + if (!prelinearized) { + // PL_LUT_NORMALIZED wants linear input data + pl_shader_linearize(sh, &img->color); + img->color.transfer = PL_COLOR_TRC_LINEAR; + prelinearized = true; + } + pl_color_space_merge(&lut_in, &img->color); + pl_color_space_merge(&lut_out, &img->color); + break; + } + + pl_shader_color_map_ex(sh, params->color_map_params, pl_color_map_args( + 
.src = image->color, + .dst = lut_in, + .prelinearized = prelinearized, + )); + + if (params->lut_type == PL_LUT_NORMALIZED) { + GLSLF("color.rgb *= vec3(1.0/"$"); \n", + SH_FLOAT(pl_color_transfer_nominal_peak(lut_in.transfer))); + } + + pl_shader_custom_lut(sh, params->lut, &rr->lut_state[LUT_PARAMS]); + + if (params->lut_type == PL_LUT_NORMALIZED) { + GLSLF("color.rgb *= vec3("$"); \n", + SH_FLOAT(pl_color_transfer_nominal_peak(lut_out.transfer))); + } + + if (params->lut_type != PL_LUT_CONVERSION) { + pl_shader_color_map_ex(sh, params->color_map_params, pl_color_map_args( + .src = lut_out, + .dst = img->color, + )); + } + } + + if (need_conversion) { + struct pl_color_space target_csp = target->color; + if (target->icc) + target_csp.transfer = PL_COLOR_TRC_LINEAR; + + if (pass->need_peak_fbo && !img_tex(pass, img)) + return; + + // generate HDR feature map if required + pl_tex feature_map = get_feature_map(pass); + sh = img_sh(pass, img); // `get_feature_map` dispatches previous shader + + // current -> target + pl_shader_color_map_ex(sh, params->color_map_params, pl_color_map_args( + .src = image->color, + .dst = target_csp, + .prelinearized = prelinearized, + .state = &rr->tone_map_state, + .feature_map = feature_map, + )); + + if (target->icc) + pl_icc_encode(sh, target->icc, &rr->icc_state[ICC_TARGET]); + } + + enum pl_lut_type lut_type = guess_frame_lut_type(target, true); + if (lut_type == PL_LUT_NORMALIZED || lut_type == PL_LUT_CONVERSION) + pl_shader_custom_lut(sh, target->lut, &rr->lut_state[LUT_TARGET]); + + img->color = target->color; +} + +// Returns true if error diffusion was successfully performed +static bool pass_error_diffusion(struct pass_state *pass, pl_shader *sh, + int new_depth, int comps, int out_w, int out_h) +{ + const struct pl_render_params *params = pass->params; + pl_renderer rr = pass->rr; + if (!params->error_diffusion || (rr->errors & PL_RENDER_ERR_ERROR_DIFFUSION)) + return false; + + size_t shmem_req = pl_error_diffusion_shmem_req(params->error_diffusion, out_h); + if (shmem_req > rr->gpu->glsl.max_shmem_size) { + PL_TRACE(rr, "Disabling error diffusion due to shmem requirements (%zu) " + "exceeding capabilities (%zu)", shmem_req, rr->gpu->glsl.max_shmem_size); + return false; + } + + pl_fmt fmt = pass->fbofmt[comps]; + if (!fmt || !(fmt->caps & PL_FMT_CAP_STORABLE)) { + PL_ERR(rr, "Error diffusion requires storable FBOs but GPU does not " + "provide them... disabling!"); + goto error; + } + + struct pl_error_diffusion_params edpars = { + .new_depth = new_depth, + .kernel = params->error_diffusion, + }; + + // Create temporary framebuffers + edpars.input_tex = get_fbo(pass, out_w, out_h, fmt, comps, PL_DEBUG_TAG); + edpars.output_tex = get_fbo(pass, out_w, out_h, fmt, comps, PL_DEBUG_TAG); + if (!edpars.input_tex || !edpars.output_tex) + goto error; + + pl_shader dsh = pl_dispatch_begin(rr->dp); + if (!pl_shader_error_diffusion(dsh, &edpars)) { + pl_dispatch_abort(rr->dp, &dsh); + goto error; + } + + // Everything was okay, run the shaders + bool ok = pl_dispatch_finish(rr->dp, pl_dispatch_params( + .shader = sh, + .target = edpars.input_tex, + )); + + if (ok) { + ok = pl_dispatch_compute(rr->dp, pl_dispatch_compute_params( + .shader = &dsh, + .dispatch_size = {1, 1, 1}, + )); + } + + *sh = pl_dispatch_begin(rr->dp); + pl_shader_sample_direct(*sh, pl_sample_src( + .tex = ok ? 
edpars.output_tex : edpars.input_tex, + )); + return ok; + +error: + rr->errors |= PL_RENDER_ERR_ERROR_DIFFUSION; + return false; +} + +#define CLEAR_COL(params) \ + (float[4]) { \ + (params)->background_color[0], \ + (params)->background_color[1], \ + (params)->background_color[2], \ + 1.0 - (params)->background_transparency, \ + } + +static bool pass_output_target(struct pass_state *pass) +{ + const struct pl_render_params *params = pass->params; + const struct pl_frame *image = &pass->image; + const struct pl_frame *target = &pass->target; + pl_renderer rr = pass->rr; + + struct img *img = &pass->img; + pl_shader sh = img_sh(pass, img); + + if (params->corner_rounding > 0.0f) { + const float out_w2 = fabsf(pl_rect_w(target->crop)) / 2.0f; + const float out_h2 = fabsf(pl_rect_h(target->crop)) / 2.0f; + const float radius = fminf(params->corner_rounding, 1.0f) * + fminf(out_w2, out_h2); + const struct pl_rect2df relpos = { + .x0 = -out_w2, .y0 = -out_h2, + .x1 = out_w2, .y1 = out_h2, + }; + GLSL("float radius = "$"; \n" + "vec2 size2 = vec2("$", "$"); \n" + "vec2 relpos = "$"; \n" + "vec2 rd = abs(relpos) - size2 + vec2(radius); \n" + "float rdist = length(max(rd, 0.0)) - radius; \n" + "float border = smoothstep(2.0f, 0.0f, rdist); \n", + SH_FLOAT_DYN(radius), + SH_FLOAT_DYN(out_w2), SH_FLOAT_DYN(out_h2), + sh_attr_vec2(sh, "relpos", &relpos)); + + switch (img->repr.alpha) { + case PL_ALPHA_UNKNOWN: + GLSL("color.a = border; \n"); + img->repr.alpha = PL_ALPHA_INDEPENDENT; + img->comps = 4; + break; + case PL_ALPHA_INDEPENDENT: + GLSL("color.a *= border; \n"); + break; + case PL_ALPHA_PREMULTIPLIED: + GLSL("color *= border; \n"); + break; + case PL_ALPHA_MODE_COUNT: + pl_unreachable(); + } + } + + const struct pl_plane *ref = &target->planes[pass->dst_ref]; + pl_rect2d dst_rect = pass->dst_rect; + if (params->distort_params) { + struct pl_distort_params dpars = *params->distort_params; + if (dpars.alpha_mode) { + pl_shader_set_alpha(sh, &img->repr, dpars.alpha_mode); + img->repr.alpha = dpars.alpha_mode; + img->comps = 4; + } + pl_tex tex = img_tex(pass, img); + if (!tex) + return false; + // Expand canvas to fit result of distortion + const float ar = pl_rect2df_aspect(&target->crop); + const float sx = fminf(ar, 1.0f); + const float sy = fminf(1.0f / ar, 1.0f); + pl_rect2df bb = pl_transform2x2_bounds(&dpars.transform, &(pl_rect2df) { + .x0 = -sx, .x1 = sx, + .y0 = -sy, .y1 = sy, + }); + + // Clamp to output size and adjust as needed when constraining output + pl_rect2df tmp = target->crop; + pl_rect2df_stretch(&tmp, pl_rect_w(bb) / (2*sx), pl_rect_h(bb) / (2*sy)); + const float tmp_w = pl_rect_w(tmp), tmp_h = pl_rect_h(tmp); + int canvas_w = ref->texture->params.w, + canvas_h = ref->texture->params.h; + if (pass->rotation % PL_ROTATION_180 == PL_ROTATION_90) + PL_SWAP(canvas_w, canvas_h); + tmp.x0 = PL_CLAMP(tmp.x0, 0.0f, canvas_w); + tmp.x1 = PL_CLAMP(tmp.x1, 0.0f, canvas_w); + tmp.y0 = PL_CLAMP(tmp.y0, 0.0f, canvas_h); + tmp.y1 = PL_CLAMP(tmp.y1, 0.0f, canvas_h); + if (dpars.constrain) { + const float rx = pl_rect_w(tmp) / tmp_w; + const float ry = pl_rect_h(tmp) / tmp_h; + pl_rect2df_stretch(&tmp, fminf(ry / rx, 1.0f), fminf(rx / ry, 1.0f)); + } + dst_rect.x0 = roundf(tmp.x0); + dst_rect.x1 = roundf(tmp.x1); + dst_rect.y0 = roundf(tmp.y0); + dst_rect.y1 = roundf(tmp.y1); + dpars.unscaled = true; + img->w = abs(pl_rect_w(dst_rect)); + img->h = abs(pl_rect_h(dst_rect)); + img->tex = NULL; + img->sh = sh = pl_dispatch_begin(rr->dp); + pl_shader_distort(sh, tex, img->w, img->h, &dpars); 
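+        // Rough sketch of the expansion above (hypothetical numbers, and
+        // assuming pl_transform2x2_bounds yields the axis-aligned bounding
+        // box of the transformed rect): for a square crop (ar = 1, so
+        // sx = sy = 1) under a 45 degree rotation, `bb` spans
+        // [-sqrt(2), sqrt(2)] on both axes, so the crop is stretched by
+        // sqrt(2) per axis before being clamped to the canvas (and, with
+        // `constrain` set, shrunk evenly so both axes lose the same factor).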
+ } + + pass_hook(pass, img, PL_HOOK_PRE_OUTPUT); + + bool need_blend = params->blend_against_tiles || + (!target->repr.alpha && !params->blend_params); + if (img->comps == 4 && need_blend) { + if (params->blend_against_tiles) { + static const float zero[2][3] = {0}; + const float (*color)[3] = params->tile_colors; + if (memcmp(color, zero, sizeof(zero)) == 0) + color = pl_render_default_params.tile_colors; + int size = PL_DEF(params->tile_size, pl_render_default_params.tile_size); + GLSLH("#define bg_tile_a vec3("$", "$", "$") \n", + SH_FLOAT(color[0][0]), SH_FLOAT(color[0][1]), SH_FLOAT(color[0][2])); + GLSLH("#define bg_tile_b vec3("$", "$", "$") \n", + SH_FLOAT(color[1][0]), SH_FLOAT(color[1][1]), SH_FLOAT(color[1][2])); + GLSL("vec2 outcoord = gl_FragCoord.xy * "$"; \n" + "bvec2 tile = lessThan(fract(outcoord), vec2(0.5)); \n" + "vec3 bg_color = tile.x == tile.y ? bg_tile_a : bg_tile_b; \n", + SH_FLOAT(1.0 / size)); + } else { + GLSLH("#define bg_color vec3("$", "$", "$") \n", + SH_FLOAT(params->background_color[0]), + SH_FLOAT(params->background_color[1]), + SH_FLOAT(params->background_color[2])); + } + + pl_shader_set_alpha(sh, &img->repr, PL_ALPHA_PREMULTIPLIED); + GLSL("color = vec4(color.rgb + bg_color * (1.0 - color.a), 1.0); \n"); + img->repr.alpha = PL_ALPHA_UNKNOWN; + img->comps = 3; + } + + // Apply the color scale separately, after encoding is done, to make sure + // that the intermediate FBO (if any) has the correct precision. + struct pl_color_repr repr = target->repr; + float scale = pl_color_repr_normalize(&repr); + enum pl_lut_type lut_type = guess_frame_lut_type(target, true); + if (lut_type != PL_LUT_CONVERSION) + pl_shader_encode_color(sh, &repr); + if (lut_type == PL_LUT_NATIVE) { + pl_shader_set_alpha(sh, &img->repr, PL_ALPHA_INDEPENDENT); + pl_shader_custom_lut(sh, target->lut, &rr->lut_state[LUT_TARGET]); + pl_shader_set_alpha(sh, &img->repr, PL_ALPHA_PREMULTIPLIED); + } + + // Rotation handling + if (pass->rotation % PL_ROTATION_180 == PL_ROTATION_90) { + PL_SWAP(dst_rect.x0, dst_rect.y0); + PL_SWAP(dst_rect.x1, dst_rect.y1); + PL_SWAP(img->w, img->h); + sh->transpose = true; + } + + pass_hook(pass, img, PL_HOOK_OUTPUT); + sh = NULL; + + bool flipped_x = dst_rect.x1 < dst_rect.x0, + flipped_y = dst_rect.y1 < dst_rect.y0; + + if (!params->skip_target_clearing && pl_frame_is_cropped(target)) + pl_frame_clear_rgba(rr->gpu, target, CLEAR_COL(params)); + + for (int p = 0; p < target->num_planes; p++) { + const struct pl_plane *plane = &target->planes[p]; + float rx = (float) plane->texture->params.w / ref->texture->params.w, + ry = (float) plane->texture->params.h / ref->texture->params.h; + + // Only accept integer scaling ratios. This accounts for the fact + // that fractionally subsampled planes get rounded up to the + // nearest integer size, which we want to over-render. + float rrx = rx >= 1 ? roundf(rx) : 1.0 / roundf(1.0 / rx), + rry = ry >= 1 ? 
roundf(ry) : 1.0 / roundf(1.0 / ry); + float sx = plane->shift_x, sy = plane->shift_y; + + pl_rect2df plane_rectf = { + .x0 = (dst_rect.x0 - sx) * rrx, + .y0 = (dst_rect.y0 - sy) * rry, + .x1 = (dst_rect.x1 - sx) * rrx, + .y1 = (dst_rect.y1 - sy) * rry, + }; + + // Normalize to make the math easier + pl_rect2df_normalize(&plane_rectf); + + // Round the output rect + int rx0 = floorf(plane_rectf.x0), ry0 = floorf(plane_rectf.y0), + rx1 = ceilf(plane_rectf.x1), ry1 = ceilf(plane_rectf.y1); + + PL_TRACE(rr, "Subsampled target %d: {%f %f %f %f} -> {%d %d %d %d}", + p, plane_rectf.x0, plane_rectf.y0, + plane_rectf.x1, plane_rectf.y1, + rx0, ry0, rx1, ry1); + + if (target->num_planes > 1) { + + // Planar output, so we need to sample from an intermediate FBO + struct pl_sample_src src = { + .tex = img_tex(pass, img), + .new_w = rx1 - rx0, + .new_h = ry1 - ry0, + .rect = { + .x0 = (rx0 - plane_rectf.x0) / rrx, + .x1 = (rx1 - plane_rectf.x0) / rrx, + .y0 = (ry0 - plane_rectf.y0) / rry, + .y1 = (ry1 - plane_rectf.y0) / rry, + }, + }; + + if (!src.tex) { + PL_ERR(rr, "Output requires multiple planes, but FBOs are " + "unavailable. This combination is unsupported."); + return false; + } + + PL_TRACE(rr, "Sampling %dx%d img aligned from {%f %f %f %f}", + pass->img.w, pass->img.h, + src.rect.x0, src.rect.y0, + src.rect.x1, src.rect.y1); + + for (int c = 0; c < plane->components; c++) { + if (plane->component_mapping[c] < 0) + continue; + src.component_mask |= 1 << plane->component_mapping[c]; + } + + sh = pl_dispatch_begin(rr->dp); + dispatch_sampler(pass, sh, &rr->samplers_dst[p], SAMPLER_PLANE, + plane->texture, &src); + + } else { + + // Single plane, so we can directly re-use the img shader unless + // it's incompatible with the FBO capabilities + bool is_comp = pl_shader_is_compute(img_sh(pass, img)); + if (is_comp && !plane->texture->params.storable) { + if (!img_tex(pass, img)) { + PL_ERR(rr, "Rendering requires compute shaders, but output " + "is not storable, and FBOs are unavailable. This " + "combination is unsupported."); + return false; + } + } + + sh = img_sh(pass, img); + img->sh = NULL; + + } + + // Ignore dithering for > 16-bit outputs by default, since it makes + // little sense to do so (and probably just adds errors) + int depth = target->repr.bits.color_depth, applied_dither = 0; + if (depth && (depth < 16 || params->force_dither)) { + if (pass_error_diffusion(pass, &sh, depth, plane->components, + rx1 - rx0, ry1 - ry0)) + { + applied_dither = depth; + } else if (params->dither_params) { + struct pl_dither_params dparams = *params->dither_params; + if (!params->disable_dither_gamma_correction) + dparams.transfer = target->color.transfer; + pl_shader_dither(sh, depth, &rr->dither_state, &dparams); + applied_dither = depth; + } + } + + if (applied_dither != rr->prev_dither) { + if (applied_dither) { + PL_INFO(rr, "Dithering to %d bit depth", applied_dither); + } else { + PL_INFO(rr, "Dithering disabled"); + } + rr->prev_dither = applied_dither; + } + + GLSL("color *= vec4(1.0 / "$"); \n", SH_FLOAT(scale)); + swizzle_color(sh, plane->components, plane->component_mapping, + params->blend_params); + + pl_rect2d plane_rect = { + .x0 = flipped_x ? rx1 : rx0, + .x1 = flipped_x ? rx0 : rx1, + .y0 = flipped_y ? ry1 : ry0, + .y1 = flipped_y ? 
ry0 : ry1, + }; + + pl_transform2x2 tscale = { + .mat = {{{ rrx, 0.0 }, { 0.0, rry }}}, + .c = { -sx, -sy }, + }; + + if (plane->flipped) { + int plane_h = rry * ref->texture->params.h; + plane_rect.y0 = plane_h - plane_rect.y0; + plane_rect.y1 = plane_h - plane_rect.y1; + tscale.mat.m[1][1] = -tscale.mat.m[1][1]; + tscale.c[1] += plane->texture->params.h; + } + + bool ok = pl_dispatch_finish(rr->dp, pl_dispatch_params( + .shader = &sh, + .target = plane->texture, + .blend_params = params->blend_params, + .rect = plane_rect, + )); + + if (!ok) + return false; + + if (pass->info.stage != PL_RENDER_STAGE_BLEND) { + draw_overlays(pass, plane->texture, plane->components, + plane->component_mapping, image->overlays, + image->num_overlays, target->color, target->repr, + &tscale); + } + + draw_overlays(pass, plane->texture, plane->components, + plane->component_mapping, target->overlays, + target->num_overlays, target->color, target->repr, + &tscale); + } + + *img = (struct img) {0}; + return true; +} + +#define require(expr) pl_require(rr, expr) +#define validate_plane(plane, param) \ + do { \ + require((plane).texture); \ + require((plane).texture->params.param); \ + require((plane).components > 0 && (plane).components <= 4); \ + for (int c = 0; c < (plane).components; c++) { \ + require((plane).component_mapping[c] >= PL_CHANNEL_NONE && \ + (plane).component_mapping[c] <= PL_CHANNEL_A); \ + } \ + } while (0) + +#define validate_overlay(overlay) \ + do { \ + require((overlay).tex); \ + require((overlay).tex->params.sampleable); \ + require((overlay).num_parts >= 0); \ + for (int n = 0; n < (overlay).num_parts; n++) { \ + const struct pl_overlay_part *p = &(overlay).parts[n]; \ + require(pl_rect_w(p->dst) && pl_rect_h(p->dst)); \ + } \ + } while (0) + +#define validate_deinterlace_ref(image, ref) \ + do { \ + require((image)->num_planes == (ref)->num_planes); \ + const struct pl_tex_params *imgp, *refp; \ + for (int p = 0; p < (image)->num_planes; p++) { \ + validate_plane((ref)->planes[p], sampleable); \ + imgp = &(image)->planes[p].texture->params; \ + refp = &(ref)->planes[p].texture->params; \ + require(imgp->w == refp->w); \ + require(imgp->h == refp->h); \ + require(imgp->format->num_components == refp->format->num_components);\ + } \ + } while (0) + +// Perform some basic validity checks on incoming structs to help catch invalid +// API usage. This is not an exhaustive check. In particular, enums are not +// bounds checked. This is because most functions accepting enums already +// abort() in the default case, and because it's not the intent of this check +// to catch all instances of memory corruption - just common logic bugs. +static bool validate_structs(pl_renderer rr, + const struct pl_frame *image, + const struct pl_frame *target) +{ + // Rendering to/from a frame with no planes is technically allowed, but so + // pointless that it's more likely to be a user error worth catching. 
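+    // For example (illustrative): a target plane whose texture was created
+    // without the `renderable` flag, or with `components` outside 1..4,
+    // fails the checks below and the whole render call returns false.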
+ require(target->num_planes > 0 && target->num_planes <= PL_MAX_PLANES); + for (int i = 0; i < target->num_planes; i++) + validate_plane(target->planes[i], renderable); + require(!pl_rect_w(target->crop) == !pl_rect_h(target->crop)); + require(target->num_overlays >= 0); + for (int i = 0; i < target->num_overlays; i++) + validate_overlay(target->overlays[i]); + + if (!image) + return true; + + require(image->num_planes > 0 && image->num_planes <= PL_MAX_PLANES); + for (int i = 0; i < image->num_planes; i++) + validate_plane(image->planes[i], sampleable); + require(!pl_rect_w(image->crop) == !pl_rect_h(image->crop)); + require(image->num_overlays >= 0); + for (int i = 0; i < image->num_overlays; i++) + validate_overlay(image->overlays[i]); + + if (image->field != PL_FIELD_NONE) { + require(image->first_field != PL_FIELD_NONE); + if (image->prev) + validate_deinterlace_ref(image, image->prev); + if (image->next) + validate_deinterlace_ref(image, image->next); + } + + return true; + +error: + return false; +} + +// returns index +static int frame_ref(const struct pl_frame *frame) +{ + pl_assert(frame->num_planes); + for (int i = 0; i < frame->num_planes; i++) { + switch (detect_plane_type(&frame->planes[i], &frame->repr)) { + case PLANE_RGB: + case PLANE_LUMA: + case PLANE_XYZ: + return i; + case PLANE_CHROMA: + case PLANE_ALPHA: + continue; + case PLANE_INVALID: + pl_unreachable(); + } + } + + return 0; +} + +static void fix_refs_and_rects(struct pass_state *pass) +{ + struct pl_frame *target = &pass->target; + pl_rect2df *dst = &target->crop; + pass->dst_ref = frame_ref(target); + pl_tex dst_ref = target->planes[pass->dst_ref].texture; + int dst_w = dst_ref->params.w, dst_h = dst_ref->params.h; + + if ((!dst->x0 && !dst->x1) || (!dst->y0 && !dst->y1)) { + dst->x1 = dst_w; + dst->y1 = dst_h; + } + + if (pass->src_ref < 0) { + // Simplified version of the below code which only rounds the target + // rect but doesn't retroactively apply the crop to the image + pass->rotation = pl_rotation_normalize(-target->rotation); + pl_rect2df_rotate(dst, -pass->rotation); + if (pass->rotation % PL_ROTATION_180 == PL_ROTATION_90) + PL_SWAP(dst_w, dst_h); + + *dst = (pl_rect2df) { + .x0 = roundf(PL_CLAMP(dst->x0, 0.0, dst_w)), + .y0 = roundf(PL_CLAMP(dst->y0, 0.0, dst_h)), + .x1 = roundf(PL_CLAMP(dst->x1, 0.0, dst_w)), + .y1 = roundf(PL_CLAMP(dst->y1, 0.0, dst_h)), + }; + + pass->dst_rect = (pl_rect2d) { + dst->x0, dst->y0, dst->x1, dst->y1, + }; + + return; + } + + struct pl_frame *image = &pass->image; + pl_rect2df *src = &image->crop; + pass->src_ref = frame_ref(image); + pl_tex src_ref = image->planes[pass->src_ref].texture; + + if ((!src->x0 && !src->x1) || (!src->y0 && !src->y1)) { + src->x1 = src_ref->params.w; + src->y1 = src_ref->params.h; + }; + + // Compute end-to-end rotation + pass->rotation = pl_rotation_normalize(image->rotation - target->rotation); + pl_rect2df_rotate(dst, -pass->rotation); // normalize by counter-rotating + if (pass->rotation % PL_ROTATION_180 == PL_ROTATION_90) + PL_SWAP(dst_w, dst_h); + + // Keep track of whether the end-to-end rendering is flipped + bool flipped_x = (src->x0 > src->x1) != (dst->x0 > dst->x1), + flipped_y = (src->y0 > src->y1) != (dst->y0 > dst->y1); + + // Normalize both rects to make the math easier + pl_rect2df_normalize(src); + pl_rect2df_normalize(dst); + + // Round the output rect and clip it to the framebuffer dimensions + float rx0 = roundf(PL_CLAMP(dst->x0, 0.0, dst_w)), + ry0 = roundf(PL_CLAMP(dst->y0, 0.0, dst_h)), + rx1 = 
roundf(PL_CLAMP(dst->x1, 0.0, dst_w)), + ry1 = roundf(PL_CLAMP(dst->y1, 0.0, dst_h)); + + // Adjust the src rect corresponding to the rounded crop + float scale_x = pl_rect_w(*src) / pl_rect_w(*dst), + scale_y = pl_rect_h(*src) / pl_rect_h(*dst), + base_x = src->x0, + base_y = src->y0; + + src->x0 = base_x + (rx0 - dst->x0) * scale_x; + src->x1 = base_x + (rx1 - dst->x0) * scale_x; + src->y0 = base_y + (ry0 - dst->y0) * scale_y; + src->y1 = base_y + (ry1 - dst->y0) * scale_y; + + // Update dst_rect to the rounded values and re-apply flip if needed. We + // always do this in the `dst` rather than the `src`` because this allows + // e.g. polar sampling compute shaders to work. + *dst = (pl_rect2df) { + .x0 = flipped_x ? rx1 : rx0, + .y0 = flipped_y ? ry1 : ry0, + .x1 = flipped_x ? rx0 : rx1, + .y1 = flipped_y ? ry0 : ry1, + }; + + // Copies of the above, for convenience + pass->ref_rect = *src; + pass->dst_rect = (pl_rect2d) { + dst->x0, dst->y0, dst->x1, dst->y1, + }; +} + +static void fix_frame(struct pl_frame *frame) +{ + pl_tex tex = frame->planes[frame_ref(frame)].texture; + + if (frame->repr.sys == PL_COLOR_SYSTEM_XYZ) { + // XYZ is implicity converted to linear DCI-P3 in pl_color_repr_decode + frame->color.primaries = PL_COLOR_PRIM_DCI_P3; + frame->color.transfer = PL_COLOR_TRC_ST428; + } + + // If the primaries are not known, guess them based on the resolution + if (tex && !frame->color.primaries) + frame->color.primaries = pl_color_primaries_guess(tex->params.w, tex->params.h); + + // For UNORM formats, we can infer the sampled bit depth from the texture + // itself. This is ignored for other format types, because the logic + // doesn't really work out for them anyways, and it's best not to do + // anything too crazy unless the user provides explicit details. + struct pl_bit_encoding *bits = &frame->repr.bits; + if (!bits->sample_depth && tex && tex->params.format->type == PL_FMT_UNORM) { + // Just assume the first component's depth is canonical. This works in + // practice, since for cases like rgb565 we want to use the lower depth + // anyway. Plus, every format has at least one component. + bits->sample_depth = tex->params.format->component_depth[0]; + + // If we don't know the color depth, assume it spans the full range of + // the texture. Otherwise, clamp it to the texture depth. + bits->color_depth = PL_DEF(bits->color_depth, bits->sample_depth); + bits->color_depth = PL_MIN(bits->color_depth, bits->sample_depth); + + // If the texture depth is higher than the known color depth, assume + // the colors were left-shifted. 
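+        // E.g. (illustrative): 10-bit video uploaded into a 16-bit UNORM
+        // texture gives sample_depth = 16; with a caller-supplied
+        // color_depth of 10, the line below adds a bit_shift of 6,
+        // matching MSB-aligned (left-shifted) samples.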
+ bits->bit_shift += bits->sample_depth - bits->color_depth; + } +} + +static bool acquire_frame(struct pass_state *pass, struct pl_frame *frame, + bool *acquired) +{ + if (!frame || !frame->acquire || *acquired) + return true; + + *acquired = true; + return frame->acquire(pass->rr->gpu, frame); +} + +static void release_frame(struct pass_state *pass, struct pl_frame *frame, + bool *acquired) +{ + if (frame && frame->release && *acquired) + frame->release(pass->rr->gpu, frame); + *acquired = false; +} + +static void pass_uninit(struct pass_state *pass) +{ + pl_renderer rr = pass->rr; + pl_dispatch_abort(rr->dp, &pass->img.sh); + release_frame(pass, &pass->next, &pass->acquired.next); + release_frame(pass, &pass->prev, &pass->acquired.prev); + release_frame(pass, &pass->image, &pass->acquired.image); + release_frame(pass, &pass->target, &pass->acquired.target); + pl_free_ptr(&pass->tmp); +} + +static void icc_fallback(struct pass_state *pass, struct pl_frame *frame, + struct icc_state *fallback) +{ + if (!frame || frame->icc || !frame->profile.data) + return; + + // Don't re-attempt opening already failed profiles + if (fallback->error && fallback->error == frame->profile.signature) + return; + +#ifdef PL_HAVE_LCMS + pl_renderer rr = pass->rr; + if (pl_icc_update(rr->log, &fallback->icc, &frame->profile, NULL)) { + frame->icc = fallback->icc; + } else { + PL_WARN(rr, "Failed opening ICC profile... ignoring"); + fallback->error = frame->profile.signature; + } +#endif +} + +static void pass_fix_frames(struct pass_state *pass) +{ + pl_renderer rr = pass->rr; + struct pl_frame *image = pass->src_ref < 0 ? NULL : &pass->image; + struct pl_frame *target = &pass->target; + + fix_refs_and_rects(pass); + + // Fallback for older ICC profile API + icc_fallback(pass, image, &rr->icc_fallback[ICC_IMAGE]); + icc_fallback(pass, target, &rr->icc_fallback[ICC_TARGET]); + + // Force colorspace metadata to ICC profile values, if present + if (image && image->icc) { + image->color.primaries = image->icc->containing_primaries; + image->color.hdr = image->icc->csp.hdr; + } + + if (target->icc) { + target->color.primaries = target->icc->containing_primaries; + target->color.hdr = target->icc->csp.hdr; + } + + // Infer the target color space info based on the image's + if (image) { + fix_frame(image); + pl_color_space_infer_map(&image->color, &target->color); + fix_frame(target); // do this only after infer_map + } else { + fix_frame(target); + pl_color_space_infer(&target->color); + } + + // Detect the presence of an alpha channel in the frames and explicitly + // default the alpha mode in this case, so we can use it to detect whether + // or not to strip the alpha channel during rendering. + // + // Note the different defaults for the image and target, because files + // are usually independent but windowing systems usually expect + // premultiplied. (We also premultiply for internal rendering, so this + // way of doing it avoids a possible division-by-zero path!) 
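+    // For example (illustrative): an RGBA image plane with component_mapping
+    // {0, 1, 2, 3} defaults to PL_ALPHA_INDEPENDENT below, whereas the same
+    // layout on the target defaults to PL_ALPHA_PREMULTIPLIED, so the
+    // renderer knows not to strip the alpha channel.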
+ if (image && !image->repr.alpha) { + for (int i = 0; i < image->num_planes; i++) { + const struct pl_plane *plane = &image->planes[i]; + for (int c = 0; c < plane->components; c++) { + if (plane->component_mapping[c] == PL_CHANNEL_A) + image->repr.alpha = PL_ALPHA_INDEPENDENT; + } + } + } + + if (!target->repr.alpha) { + for (int i = 0; i < target->num_planes; i++) { + const struct pl_plane *plane = &target->planes[i]; + for (int c = 0; c < plane->components; c++) { + if (plane->component_mapping[c] == PL_CHANNEL_A) + target->repr.alpha = PL_ALPHA_PREMULTIPLIED; + } + } + } +} + +void pl_frames_infer(pl_renderer rr, struct pl_frame *image, + struct pl_frame *target) +{ + struct pass_state pass = { + .rr = rr, + .image = *image, + .target = *target, + }; + + pass_fix_frames(&pass); + *image = pass.image; + *target = pass.target; +} + +static bool pass_init(struct pass_state *pass, bool acquire_image) +{ + struct pl_frame *image = pass->src_ref < 0 ? NULL : &pass->image; + struct pl_frame *target = &pass->target; + + if (!acquire_frame(pass, target, &pass->acquired.target)) + goto error; + if (acquire_image && image) { + if (!acquire_frame(pass, image, &pass->acquired.image)) + goto error; + + const struct pl_render_params *params = pass->params; + const struct pl_deinterlace_params *deint = params->deinterlace_params; + bool needs_refs = image->field != PL_FIELD_NONE && deint && + pl_deinterlace_needs_refs(deint->algo); + + if (image->prev && needs_refs) { + // Move into local copy so we can acquire/release it + pass->prev = *image->prev; + image->prev = &pass->prev; + if (!acquire_frame(pass, &pass->prev, &pass->acquired.prev)) + goto error; + } + if (image->next && needs_refs) { + pass->next = *image->next; + image->next = &pass->next; + if (!acquire_frame(pass, &pass->next, &pass->acquired.next)) + goto error; + } + } + + if (!validate_structs(pass->rr, acquire_image ? image : NULL, target)) + goto error; + + find_fbo_format(pass); + pass_fix_frames(pass); + + pass->tmp = pl_tmp(NULL); + return true; + +error: + pass_uninit(pass); + return false; +} + +static void pass_begin_frame(struct pass_state *pass) +{ + pl_renderer rr = pass->rr; + const struct pl_render_params *params = pass->params; + + pl_dispatch_callback(rr->dp, pass, info_callback); + pl_dispatch_reset_frame(rr->dp); + + for (int i = 0; i < params->num_hooks; i++) { + if (params->hooks[i]->reset) + params->hooks[i]->reset(params->hooks[i]->priv); + } + + size_t size = rr->fbos.num * sizeof(bool); + pass->fbos_used = pl_realloc(pass->tmp, pass->fbos_used, size); + memset(pass->fbos_used, 0, size); +} + +static bool draw_empty_overlays(pl_renderer rr, + const struct pl_frame *ptarget, + const struct pl_render_params *params) +{ + if (!params->skip_target_clearing) + pl_frame_clear_rgba(rr->gpu, ptarget, CLEAR_COL(params)); + + if (!ptarget->num_overlays) + return true; + + struct pass_state pass = { + .rr = rr, + .params = params, + .src_ref = -1, + .target = *ptarget, + .info.stage = PL_RENDER_STAGE_BLEND, + .info.count = 0, + }; + + if (!pass_init(&pass, false)) + return false; + + pass_begin_frame(&pass); + struct pl_frame *target = &pass.target; + pl_tex ref = target->planes[pass.dst_ref].texture; + for (int p = 0; p < target->num_planes; p++) { + const struct pl_plane *plane = &target->planes[p]; + // Math replicated from `pass_output_target` + float rx = (float) plane->texture->params.w / ref->params.w, + ry = (float) plane->texture->params.h / ref->params.h; + float rrx = rx >= 1 ? 
roundf(rx) : 1.0 / roundf(1.0 / rx), + rry = ry >= 1 ? roundf(ry) : 1.0 / roundf(1.0 / ry); + float sx = plane->shift_x, sy = plane->shift_y; + + pl_transform2x2 tscale = { + .mat = {{{ rrx, 0.0 }, { 0.0, rry }}}, + .c = { -sx, -sy }, + }; + + if (plane->flipped) { + tscale.mat.m[1][1] = -tscale.mat.m[1][1]; + tscale.c[1] += plane->texture->params.h; + } + + draw_overlays(&pass, plane->texture, plane->components, + plane->component_mapping, target->overlays, + target->num_overlays, target->color, target->repr, + &tscale); + } + + pass_uninit(&pass); + return true; +} + +bool pl_render_image(pl_renderer rr, const struct pl_frame *pimage, + const struct pl_frame *ptarget, + const struct pl_render_params *params) +{ + params = PL_DEF(params, &pl_render_default_params); + pl_dispatch_mark_dynamic(rr->dp, params->dynamic_constants); + if (!pimage) + return draw_empty_overlays(rr, ptarget, params); + + struct pass_state pass = { + .rr = rr, + .params = params, + .image = *pimage, + .target = *ptarget, + .info.stage = PL_RENDER_STAGE_FRAME, + }; + + if (!pass_init(&pass, true)) + return false; + + // No-op (empty crop) + if (!pl_rect_w(pass.dst_rect) || !pl_rect_h(pass.dst_rect)) { + pass_uninit(&pass); + return draw_empty_overlays(rr, ptarget, params); + } + + pass_begin_frame(&pass); + if (!pass_read_image(&pass)) + goto error; + if (!pass_scale_main(&pass)) + goto error; + pass_convert_colors(&pass); + if (!pass_output_target(&pass)) + goto error; + + pass_uninit(&pass); + return true; + +error: + PL_ERR(rr, "Failed rendering image!"); + pass_uninit(&pass); + return false; +} + +const struct pl_frame *pl_frame_mix_current(const struct pl_frame_mix *mix) +{ + const struct pl_frame *cur = NULL; + for (int i = 0; i < mix->num_frames; i++) { + if (mix->timestamps[i] > 0.0f) + break; + cur = mix->frames[i]; + } + + return cur; +} + +const struct pl_frame *pl_frame_mix_nearest(const struct pl_frame_mix *mix) +{ + if (!mix->num_frames) + return NULL; + + const struct pl_frame *best = mix->frames[0]; + float best_dist = fabsf(mix->timestamps[0]); + for (int i = 1; i < mix->num_frames; i++) { + float dist = fabsf(mix->timestamps[i]); + if (dist < best_dist) { + best = mix->frames[i]; + best_dist = dist; + continue; + } else { + break; + } + } + + return best; +} + +struct params_info { + uint64_t hash; + bool trivial; +}; + +static struct params_info render_params_info(const struct pl_render_params *params_orig) +{ + struct pl_render_params params = *params_orig; + struct params_info info = { + .trivial = true, + .hash = 0, + }; + +#define HASH_PTR(ptr, def, ptr_trivial) \ + do { \ + if (ptr) { \ + pl_hash_merge(&info.hash, pl_mem_hash(ptr, sizeof(*ptr))); \ + info.trivial &= (ptr_trivial); \ + ptr = NULL; \ + } else if ((def) != NULL) { \ + pl_hash_merge(&info.hash, pl_mem_hash(def, sizeof(*ptr))); \ + } \ + } while (0) + +#define HASH_FILTER(scaler) \ + do { \ + if ((scaler == &pl_filter_bilinear || scaler == &pl_filter_nearest) && \ + params.skip_anti_aliasing) \ + { \ + /* treat as NULL */ \ + } else if (scaler) { \ + struct pl_filter_config filter = *scaler; \ + HASH_PTR(filter.kernel, NULL, false); \ + HASH_PTR(filter.window, NULL, false); \ + pl_hash_merge(&info.hash, pl_var_hash(filter)); \ + scaler = NULL; \ + } \ + } while (0) + + HASH_FILTER(params.upscaler); + HASH_FILTER(params.downscaler); + + HASH_PTR(params.deband_params, NULL, false); + HASH_PTR(params.sigmoid_params, NULL, false); + HASH_PTR(params.deinterlace_params, NULL, false); + HASH_PTR(params.cone_params, NULL, true); + 
HASH_PTR(params.icc_params, &pl_icc_default_params, true); + HASH_PTR(params.color_adjustment, &pl_color_adjustment_neutral, true); + HASH_PTR(params.color_map_params, &pl_color_map_default_params, true); + HASH_PTR(params.peak_detect_params, NULL, false); + + // Hash all hooks + for (int i = 0; i < params.num_hooks; i++) { + const struct pl_hook *hook = params.hooks[i]; + if (hook->stages == PL_HOOK_OUTPUT) + continue; // ignore hooks only relevant to pass_output_target + pl_hash_merge(&info.hash, pl_var_hash(*hook)); + info.trivial = false; + } + params.hooks = NULL; + + // Hash the LUT by only looking at the signature + if (params.lut) { + pl_hash_merge(&info.hash, params.lut->signature); + info.trivial = false; + params.lut = NULL; + } + +#define CLEAR(field) field = (__typeof__(field)) {0} + + // Clear out fields only relevant to pl_render_image_mix + CLEAR(params.frame_mixer); + CLEAR(params.preserve_mixing_cache); + CLEAR(params.skip_caching_single_frame); + memset(params.background_color, 0, sizeof(params.background_color)); + CLEAR(params.background_transparency); + CLEAR(params.skip_target_clearing); + CLEAR(params.blend_against_tiles); + memset(params.tile_colors, 0, sizeof(params.tile_colors)); + CLEAR(params.tile_size); + + // Clear out fields only relevant to pass_output_target + CLEAR(params.blend_params); + CLEAR(params.distort_params); + CLEAR(params.dither_params); + CLEAR(params.error_diffusion); + CLEAR(params.force_dither); + CLEAR(params.corner_rounding); + + // Clear out other irrelevant fields + CLEAR(params.dynamic_constants); + CLEAR(params.info_callback); + CLEAR(params.info_priv); + + pl_hash_merge(&info.hash, pl_var_hash(params)); + return info; +} + +#define MAX_MIX_FRAMES 16 + +bool pl_render_image_mix(pl_renderer rr, const struct pl_frame_mix *images, + const struct pl_frame *ptarget, + const struct pl_render_params *params) +{ + if (!images->num_frames) + return pl_render_image(rr, NULL, ptarget, params); + + params = PL_DEF(params, &pl_render_default_params); + struct params_info par_info = render_params_info(params); + pl_dispatch_mark_dynamic(rr->dp, params->dynamic_constants); + + require(images->num_frames >= 1); + require(images->vsync_duration > 0.0); + for (int i = 0; i < images->num_frames - 1; i++) + require(images->timestamps[i] <= images->timestamps[i+1]); + + const struct pl_frame *refimg = pl_frame_mix_nearest(images); + struct pass_state pass = { + .rr = rr, + .params = params, + .image = *refimg, + .target = *ptarget, + .info.stage = PL_RENDER_STAGE_BLEND, + }; + + if (rr->errors & PL_RENDER_ERR_FRAME_MIXING) + goto fallback; + if (!pass_init(&pass, false)) + return false; + if (!pass.fbofmt[4]) + goto fallback; + + const struct pl_frame *target = &pass.target; + int out_w = abs(pl_rect_w(pass.dst_rect)), + out_h = abs(pl_rect_h(pass.dst_rect)); + if (!out_w || !out_h) + goto fallback; + + int fidx = 0; + struct cached_frame frames[MAX_MIX_FRAMES]; + float weights[MAX_MIX_FRAMES]; + float wsum = 0.0; + + // Garbage collect the cache by evicting all frames from the cache that are + // not determined to still be required + for (int i = 0; i < rr->frames.num; i++) + rr->frames.elem[i].evict = true; + + // Blur frame mixer according to vsync ratio (source / display) + struct pl_filter_config mixer; + if (params->frame_mixer) { + mixer = *params->frame_mixer; + mixer.blur = PL_DEF(mixer.blur, 1.0); + for (int i = 1; i < images->num_frames; i++) { + if (images->timestamps[i] >= 0.0 && images->timestamps[i - 1] < 0) { + float frame_dur = 
images->timestamps[i] - images->timestamps[i - 1]; + if (images->vsync_duration > frame_dur && !params->skip_anti_aliasing) + mixer.blur *= images->vsync_duration / frame_dur; + break; + } + } + } + + // Traverse the input frames and determine/prepare the ones we need + bool single_frame = !params->frame_mixer || images->num_frames == 1; +retry: + for (int i = 0; i < images->num_frames; i++) { + uint64_t sig = images->signatures[i]; + float rts = images->timestamps[i]; + const struct pl_frame *img = images->frames[i]; + PL_TRACE(rr, "Considering image with signature 0x%llx, rts %f", + (unsigned long long) sig, rts); + + // Combining images with different rotations is basically unfeasible + if (pl_rotation_normalize(img->rotation - refimg->rotation)) { + PL_TRACE(rr, " -> Skipping: incompatible rotation"); + continue; + } + + float weight; + if (single_frame) { + + // Only render the refimg, ignore others + if (img == refimg) { + weight = 1.0; + } else { + PL_TRACE(rr, " -> Skipping: no frame mixer"); + continue; + } + + // For backwards compatibility, treat !kernel as oversample + } else if (!mixer.kernel || mixer.kernel == &pl_filter_function_oversample) { + + // Compute the visible interval [rts, end] of this frame + float end = i+1 < images->num_frames ? images->timestamps[i+1] : INFINITY; + if (rts > images->vsync_duration || end < 0.0) { + PL_TRACE(rr, " -> Skipping: no intersection with vsync"); + continue; + } else { + rts = PL_MAX(rts, 0.0); + end = PL_MIN(end, images->vsync_duration); + pl_assert(end >= rts); + } + + // Weight is the fraction of vsync interval that frame is visible + weight = (end - rts) / images->vsync_duration; + PL_TRACE(rr, " -> Frame [%f, %f] intersects [%f, %f] = weight %f", + rts, end, 0.0, images->vsync_duration, weight); + + if (weight < mixer.kernel->params[0]) { + PL_TRACE(rr, " (culling due to threshold)"); + weight = 0.0; + } + + } else { + + const float radius = pl_filter_radius_bound(&mixer); + if (fabsf(rts) >= radius) { + PL_TRACE(rr, " -> Skipping: outside filter radius (%f)", radius); + continue; + } + + // Weight is directly sampled from the filter + weight = pl_filter_sample(&mixer, rts); + PL_TRACE(rr, " -> Filter offset %f = weight %f", rts, weight); + + } + + struct cached_frame *f = NULL; + for (int j = 0; j < rr->frames.num; j++) { + if (rr->frames.elem[j].signature == sig) { + f = &rr->frames.elem[j]; + f->evict = false; + break; + } + } + + // Skip frames with negligible contributions. Do this after the loop + // above to make sure these frames don't get evicted just yet, and + // also exclude the reference image from this optimization to ensure + // that we always have at least one frame. + const float cutoff = 1e-3; + if (fabsf(weight) <= cutoff && img != refimg) { + PL_TRACE(rr, " -> Skipping: weight (%f) below threshold (%f)", + weight, cutoff); + continue; + } + + bool skip_cache = single_frame && (params->skip_caching_single_frame || par_info.trivial); + if (!f && skip_cache) { + PL_TRACE(rr, "Single frame not found in cache, bypassing"); + goto fallback; + } + + if (!f) { + // Signature does not exist in the cache at all yet, + // so grow the cache by this entry. + PL_ARRAY_GROW(rr, rr->frames); + f = &rr->frames.elem[rr->frames.num++]; + *f = (struct cached_frame) { + .signature = sig, + }; + } + + // Check to see if we can blindly reuse this cache entry. 
This is the + // case if either the params are compatible, or the user doesn't care + bool can_reuse = f->tex; + bool strict_reuse = skip_cache || single_frame || + !params->preserve_mixing_cache; + if (can_reuse && strict_reuse) { + can_reuse = f->tex->params.w == out_w && + f->tex->params.h == out_h && + pl_rect2d_eq(f->crop, img->crop) && + f->params_hash == par_info.hash && + pl_color_space_equal(&f->color, &target->color) && + pl_icc_profile_equal(&f->profile, &target->profile); + } + + if (!can_reuse && skip_cache) { + PL_TRACE(rr, "Single frame cache entry invalid, bypassing"); + goto fallback; + } + + if (!can_reuse) { + // If we can't reuse the entry, we need to re-render this frame + PL_TRACE(rr, " -> Cached texture missing or invalid.. (re)creating"); + if (!f->tex) { + if (PL_ARRAY_POP(rr->frame_fbos, &f->tex)) + pl_tex_invalidate(rr->gpu, f->tex); + } + + bool ok = pl_tex_recreate(rr->gpu, &f->tex, pl_tex_params( + .w = out_w, + .h = out_h, + .format = pass.fbofmt[4], + .sampleable = true, + .renderable = true, + .blit_dst = pass.fbofmt[4]->caps & PL_FMT_CAP_BLITTABLE, + .storable = pass.fbofmt[4]->caps & PL_FMT_CAP_STORABLE, + )); + + if (!ok) { + PL_ERR(rr, "Could not create intermediate texture for " + "frame mixing.. disabling!"); + rr->errors |= PL_RENDER_ERR_FRAME_MIXING; + goto fallback; + } + + struct pass_state inter_pass = { + .rr = rr, + .params = pass.params, + .image = *img, + .target = *ptarget, + .info.stage = PL_RENDER_STAGE_FRAME, + .acquired = pass.acquired, + }; + + // Render a single frame up to `pass_output_target` + memcpy(inter_pass.fbofmt, pass.fbofmt, sizeof(pass.fbofmt)); + if (!pass_init(&inter_pass, true)) + goto fail; + + pass_begin_frame(&inter_pass); + if (!(ok = pass_read_image(&inter_pass))) + goto inter_pass_error; + if (!(ok = pass_scale_main(&inter_pass))) + goto inter_pass_error; + pass_convert_colors(&inter_pass); + + pl_assert(inter_pass.img.sh); // guaranteed by `pass_convert_colors` + pl_shader_set_alpha(inter_pass.img.sh, &inter_pass.img.repr, + PL_ALPHA_PREMULTIPLIED); // for frame mixing + + pl_assert(inter_pass.img.w == out_w && + inter_pass.img.h == out_h); + + ok = pl_dispatch_finish(rr->dp, pl_dispatch_params( + .shader = &inter_pass.img.sh, + .target = f->tex, + )); + if (!ok) + goto inter_pass_error; + + float sx = out_w / pl_rect_w(inter_pass.dst_rect), + sy = out_h / pl_rect_h(inter_pass.dst_rect); + + pl_transform2x2 shift = { + .mat.m = {{ sx, 0, }, { 0, sy, }}, + .c = { + -sx * inter_pass.dst_rect.x0, + -sy * inter_pass.dst_rect.y0 + }, + }; + + if (inter_pass.rotation % PL_ROTATION_180 == PL_ROTATION_90) { + PL_SWAP(shift.mat.m[0][0], shift.mat.m[0][1]); + PL_SWAP(shift.mat.m[1][0], shift.mat.m[1][1]); + } + + draw_overlays(&inter_pass, f->tex, inter_pass.img.comps, NULL, + inter_pass.image.overlays, + inter_pass.image.num_overlays, + inter_pass.img.color, + inter_pass.img.repr, + &shift); + + f->params_hash = par_info.hash; + f->crop = img->crop; + f->color = inter_pass.img.color; + f->comps = inter_pass.img.comps; + f->profile = target->profile; + // fall through + +inter_pass_error: + inter_pass.acquired.target = false; // don't release target + pass_uninit(&inter_pass); + if (!ok) + goto fail; + } + + pl_assert(fidx < MAX_MIX_FRAMES); + frames[fidx] = *f; + weights[fidx] = weight; + wsum += weight; + fidx++; + } + + // Evict the frames we *don't* need + for (int i = 0; i < rr->frames.num; ) { + if (rr->frames.elem[i].evict) { + PL_TRACE(rr, "Evicting frame with signature %llx from cache", + (unsigned long long) 
rr->frames.elem[i].signature); + PL_ARRAY_APPEND(rr, rr->frame_fbos, rr->frames.elem[i].tex); + PL_ARRAY_REMOVE_AT(rr->frames, i); + continue; + } else { + i++; + } + } + + // If we got back no frames, retry with ZOH semantics + if (!fidx) { + pl_assert(!single_frame); + single_frame = true; + goto retry; + } + + // Sample and mix the output color + pass_begin_frame(&pass); + pass.info.count = fidx; + pl_assert(fidx > 0); + + pl_shader sh = pl_dispatch_begin(rr->dp); + sh_describef(sh, "frame mixing (%d frame%s)", fidx, fidx > 1 ? "s" : ""); + sh->output = PL_SHADER_SIG_COLOR; + sh->output_w = out_w; + sh->output_h = out_h; + + GLSL("vec4 color; \n" + "// pl_render_image_mix \n" + "{ \n" + "vec4 mix_color = vec4(0.0); \n"); + + int comps = 0; + for (int i = 0; i < fidx; i++) { + const struct pl_tex_params *tpars = &frames[i].tex->params; + + // Use linear sampling if desired and possible + enum pl_tex_sample_mode sample_mode = PL_TEX_SAMPLE_NEAREST; + if ((tpars->w != out_w || tpars->h != out_h) && + (tpars->format->caps & PL_FMT_CAP_LINEAR)) + { + sample_mode = PL_TEX_SAMPLE_LINEAR; + } + + ident_t pos, tex = sh_bind(sh, frames[i].tex, PL_TEX_ADDRESS_CLAMP, + sample_mode, "frame", NULL, &pos, NULL); + + GLSL("color = textureLod("$", "$", 0.0); \n", tex, pos); + + // Note: This ignores differences in ICC profile, which we decide to + // just simply not care about. Doing that properly would require + // converting between different image profiles, and the headache of + // finagling that state is just not worth it because this is an + // exceptionally unlikely hypothetical. + // + // This also ignores differences in HDR metadata, which we deliberately + // ignore because it causes aggressive shader recompilation. + struct pl_color_space frame_csp = frames[i].color; + struct pl_color_space mix_csp = target->color; + frame_csp.hdr = mix_csp.hdr = (struct pl_hdr_metadata) {0}; + pl_shader_color_map_ex(sh, NULL, pl_color_map_args(frame_csp, mix_csp)); + + float weight = weights[i] / wsum; + GLSL("mix_color += vec4("$") * color; \n", SH_FLOAT_DYN(weight)); + comps = PL_MAX(comps, frames[i].comps); + } + + GLSL("color = mix_color; \n" + "} \n"); + + // Dispatch this to the destination + pass.img = (struct img) { + .sh = sh, + .w = out_w, + .h = out_h, + .comps = comps, + .color = target->color, + .repr = { + .sys = PL_COLOR_SYSTEM_RGB, + .levels = PL_COLOR_LEVELS_PC, + .alpha = comps >= 4 ? PL_ALPHA_PREMULTIPLIED : PL_ALPHA_UNKNOWN, + }, + }; + + if (!pass_output_target(&pass)) + goto fallback; + + pass_uninit(&pass); + return true; + +fail: + PL_ERR(rr, "Could not render image for frame mixing.. 
disabling!"); + rr->errors |= PL_RENDER_ERR_FRAME_MIXING; + // fall through + +fallback: + pass_uninit(&pass); + return pl_render_image(rr, refimg, ptarget, params); + +error: // for parameter validation failures + return false; +} + +void pl_frames_infer_mix(pl_renderer rr, const struct pl_frame_mix *mix, + struct pl_frame *target, struct pl_frame *out_ref) +{ + struct pass_state pass = { + .rr = rr, + .target = *target, + }; + + const struct pl_frame *refimg = pl_frame_mix_nearest(mix); + if (refimg) { + pass.image = *refimg; + } else { + pass.src_ref = -1; + } + + pass_fix_frames(&pass); + *target = pass.target; + if (out_ref) + *out_ref = pass.image; +} + +void pl_frame_set_chroma_location(struct pl_frame *frame, + enum pl_chroma_location chroma_loc) +{ + pl_tex ref = frame->planes[frame_ref(frame)].texture; + + if (ref) { + // Texture dimensions are already known, so apply the chroma location + // only to subsampled planes + int ref_w = ref->params.w, ref_h = ref->params.h; + + for (int i = 0; i < frame->num_planes; i++) { + struct pl_plane *plane = &frame->planes[i]; + pl_tex tex = plane->texture; + bool subsampled = tex->params.w < ref_w || tex->params.h < ref_h; + if (subsampled) + pl_chroma_location_offset(chroma_loc, &plane->shift_x, &plane->shift_y); + } + } else { + // Texture dimensions are not yet known, so apply the chroma location + // to all chroma planes, regardless of subsampling + for (int i = 0; i < frame->num_planes; i++) { + struct pl_plane *plane = &frame->planes[i]; + if (detect_plane_type(plane, &frame->repr) == PLANE_CHROMA) + pl_chroma_location_offset(chroma_loc, &plane->shift_x, &plane->shift_y); + } + } +} + +void pl_frame_from_swapchain(struct pl_frame *out_frame, + const struct pl_swapchain_frame *frame) +{ + pl_tex fbo = frame->fbo; + int num_comps = fbo->params.format->num_components; + if (!frame->color_repr.alpha) + num_comps = PL_MIN(num_comps, 3); + + *out_frame = (struct pl_frame) { + .num_planes = 1, + .planes = {{ + .texture = fbo, + .flipped = frame->flipped, + .components = num_comps, + .component_mapping = {0, 1, 2, 3}, + }}, + .crop = { 0, 0, fbo->params.w, fbo->params.h }, + .repr = frame->color_repr, + .color = frame->color_space, + }; +} + +bool pl_frame_is_cropped(const struct pl_frame *frame) +{ + int x0 = roundf(PL_MIN(frame->crop.x0, frame->crop.x1)), + y0 = roundf(PL_MIN(frame->crop.y0, frame->crop.y1)), + x1 = roundf(PL_MAX(frame->crop.x0, frame->crop.x1)), + y1 = roundf(PL_MAX(frame->crop.y0, frame->crop.y1)); + + pl_tex ref = frame->planes[frame_ref(frame)].texture; + pl_assert(ref); + + if (!x0 && !x1) + x1 = ref->params.w; + if (!y0 && !y1) + y1 = ref->params.h; + + return x0 > 0 || y0 > 0 || x1 < ref->params.w || y1 < ref->params.h; +} + +void pl_frame_clear_rgba(pl_gpu gpu, const struct pl_frame *frame, + const float rgba[4]) +{ + struct pl_color_repr repr = frame->repr; + pl_transform3x3 tr = pl_color_repr_decode(&repr, NULL); + pl_transform3x3_invert(&tr); + + float encoded[3] = { rgba[0], rgba[1], rgba[2] }; + pl_transform3x3_apply(&tr, encoded); + + float mult = frame->repr.alpha == PL_ALPHA_PREMULTIPLIED ? 
rgba[3] : 1.0; + for (int p = 0; p < frame->num_planes; p++) { + const struct pl_plane *plane = &frame->planes[p]; + float clear[4] = { 0.0, 0.0, 0.0, rgba[3] }; + for (int c = 0; c < plane->components; c++) { + int ch = plane->component_mapping[c]; + if (ch >= 0 && ch < 3) + clear[c] = mult * encoded[plane->component_mapping[c]]; + } + + pl_tex_clear(gpu, plane->texture, clear); + } +} + +struct pl_render_errors pl_renderer_get_errors(pl_renderer rr) +{ + return (struct pl_render_errors) { + .errors = rr->errors, + .disabled_hooks = rr->disabled_hooks.elem, + .num_disabled_hooks = rr->disabled_hooks.num, + }; +} + +void pl_renderer_reset_errors(pl_renderer rr, + const struct pl_render_errors *errors) +{ + if (!errors) { + // Reset everything + rr->errors = PL_RENDER_ERR_NONE; + rr->disabled_hooks.num = 0; + return; + } + + // Reset only requested errors + rr->errors &= ~errors->errors; + + // Not clearing hooks + if (!(errors->errors & PL_RENDER_ERR_HOOKS)) + goto done; + + // Remove all hook signatures + if (!errors->num_disabled_hooks) { + rr->disabled_hooks.num = 0; + goto done; + } + + // At this point we require valid array of hooks + if (!errors->disabled_hooks) { + assert(errors->disabled_hooks); + goto done; + } + + for (int i = 0; i < errors->num_disabled_hooks; i++) { + for (int j = 0; j < rr->disabled_hooks.num; j++) { + // Remove only requested hook signatures + if (rr->disabled_hooks.elem[j] == errors->disabled_hooks[i]) { + PL_ARRAY_REMOVE_AT(rr->disabled_hooks, j); + break; + } + } + } + + done: + if (rr->disabled_hooks.num) + rr->errors |= PL_RENDER_ERR_HOOKS; + return; +} diff --git a/src/shaders.c b/src/shaders.c new file mode 100644 index 0000000..503ea78 --- /dev/null +++ b/src/shaders.c @@ -0,0 +1,992 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include <stdio.h> +#include <math.h> + +#include "common.h" +#include "log.h" +#include "shaders.h" + +pl_shader_info pl_shader_info_ref(pl_shader_info pinfo) +{ + struct sh_info *info = (struct sh_info *) pinfo; + if (!info) + return NULL; + + pl_rc_ref(&info->rc); + return &info->info; +} + +void pl_shader_info_deref(pl_shader_info *pinfo) +{ + struct sh_info *info = (struct sh_info *) *pinfo; + if (!info) + return; + + if (pl_rc_deref(&info->rc)) + pl_free(info); + *pinfo = NULL; +} + +static struct sh_info *sh_info_alloc(void *alloc) +{ + struct sh_info *info = pl_zalloc_ptr(alloc, info); + info->tmp = pl_tmp(info); + pl_rc_init(&info->rc); + return info; +} + +// Re-use `sh_info` allocation if possible, allocate new otherwise +static struct sh_info *sh_info_recycle(struct sh_info *info) +{ + if (!pl_rc_deref(&info->rc)) + return sh_info_alloc(NULL); + + memset(&info->info, 0, sizeof(info->info)); // reset public fields + pl_free_children(info->tmp); + pl_rc_ref(&info->rc); + info->desc.len = 0; + info->steps.num = 0; + return info; +} + +static uint8_t reverse_bits(uint8_t x) +{ + static const uint8_t reverse_nibble[16] = { + 0x0, 0x8, 0x4, 0xc, 0x2, 0xa, 0x6, 0xe, + 0x1, 0x9, 0x5, 0xd, 0x3, 0xb, 0x7, 0xf, + }; + + return reverse_nibble[x & 0xF] << 4 | reverse_nibble[x >> 4]; +} + +static void init_shader(pl_shader sh, const struct pl_shader_params *params) +{ + if (params) { + sh->info->info.params = *params; + + // To avoid collisions for shaders with very high number of + // identifiers, pack the shader ID into the highest bits (MSB -> LSB) + pl_static_assert(sizeof(sh->prefix) > sizeof(params->id)); + const int shift = 8 * (sizeof(sh->prefix) - sizeof(params->id)); + sh->prefix = reverse_bits(params->id) << shift; + } + + sh->name = sh_fresh(sh, "main"); +} + +pl_shader pl_shader_alloc(pl_log log, const struct pl_shader_params *params) +{ + static const int glsl_ver_req = 130; + if (params && params->glsl.version && params->glsl.version < 130) { + pl_err(log, "Requested GLSL version %d too low (required: %d)", + params->glsl.version, glsl_ver_req); + return NULL; + } + + pl_shader sh = pl_alloc_ptr(NULL, sh); + *sh = (struct pl_shader_t) { + .log = log, + .tmp = pl_tmp(sh), + .info = sh_info_alloc(NULL), + .mutable = true, + }; + + for (int i = 0; i < PL_ARRAY_SIZE(sh->buffers); i++) + sh->buffers[i] = pl_str_builder_alloc(sh); + + init_shader(sh, params); + return sh; +} + +static void sh_obj_deref(pl_shader_obj obj); + +void sh_deref(pl_shader sh) +{ + pl_free_children(sh->tmp); + + for (int i = 0; i < sh->obj.num; i++) + sh_obj_deref(sh->obj.elem[i]); + sh->obj.num = 0; +} + +void pl_shader_free(pl_shader *psh) +{ + pl_shader sh = *psh; + if (!sh) + return; + + sh_deref(sh); + pl_shader_info_deref((pl_shader_info *) &sh->info); + pl_free_ptr(psh); +} + +void pl_shader_reset(pl_shader sh, const struct pl_shader_params *params) +{ + sh_deref(sh); + + struct pl_shader_t new = { + .log = sh->log, + .tmp = sh->tmp, + .info = sh_info_recycle(sh->info), + .data.buf = sh->data.buf, + .mutable = true, + + // Preserve array allocations + .obj.elem = sh->obj.elem, + .vas.elem = sh->vas.elem, + .vars.elem = sh->vars.elem, + .descs.elem = sh->descs.elem, + .consts.elem = sh->consts.elem, + }; + + // Preserve buffer allocations + memcpy(new.buffers, sh->buffers, sizeof(new.buffers)); + for (int i = 0; i < PL_ARRAY_SIZE(new.buffers); i++) + pl_str_builder_reset(new.buffers[i]); + + *sh = new; + init_shader(sh, params); +} + +static void *sh_alloc(pl_shader sh, size_t size, size_t 
align) +{ + const size_t offset = PL_ALIGN2(sh->data.len, align); + const size_t req_size = offset + size; + if (req_size <= pl_get_size(sh->data.buf)) { + sh->data.len = offset + size; + return sh->data.buf + offset; + } + + // We can't realloc this buffer because various pointers will be left + // dangling, so just reparent it onto `sh->tmp` (so it will be cleaned + // up when the shader is next reset) and allocate a new, larger buffer + // in its place + const size_t new_size = PL_MAX(req_size << 1, 256); + pl_steal(sh->tmp, sh->data.buf); + sh->data.buf = pl_alloc(sh, new_size); + sh->data.len = size; + return sh->data.buf; +} + +static void *sh_memdup(pl_shader sh, const void *data, size_t size, size_t align) +{ + if (!size) + return NULL; + + void *dst = sh_alloc(sh, size, align); + assert(data); + memcpy(dst, data, size); + return dst; +} + +bool pl_shader_is_failed(const pl_shader sh) +{ + return sh->failed; +} + +struct pl_glsl_version sh_glsl(const pl_shader sh) +{ + if (SH_PARAMS(sh).glsl.version) + return SH_PARAMS(sh).glsl; + + if (SH_GPU(sh)) + return SH_GPU(sh)->glsl; + + return (struct pl_glsl_version) { .version = 130 }; +} + +bool sh_try_compute(pl_shader sh, int bw, int bh, bool flex, size_t mem) +{ + pl_assert(bw && bh); + int *sh_bw = &sh->group_size[0]; + int *sh_bh = &sh->group_size[1]; + + struct pl_glsl_version glsl = sh_glsl(sh); + if (!glsl.compute) { + PL_TRACE(sh, "Disabling compute shader due to missing `compute` support"); + return false; + } + + if (sh->shmem + mem > glsl.max_shmem_size) { + PL_TRACE(sh, "Disabling compute shader due to insufficient shmem"); + return false; + } + + if (sh->type == SH_FRAGMENT) { + PL_TRACE(sh, "Disabling compute shader because shader is already marked " + "as fragment shader"); + return false; + } + + if (bw > glsl.max_group_size[0] || + bh > glsl.max_group_size[1] || + (bw * bh) > glsl.max_group_threads) + { + if (!flex) { + PL_TRACE(sh, "Disabling compute shader due to exceeded group " + "thread count."); + return false; + } else { + // Pick better group sizes + bw = PL_MIN(bw, glsl.max_group_size[0]); + bh = glsl.max_group_threads / bw; + } + } + + sh->shmem += mem; + + // If the current shader is either not a compute shader, or we have no + // choice but to override the metadata, always do so + if (sh->type != SH_COMPUTE || (sh->flexible_work_groups && !flex)) { + *sh_bw = bw; + *sh_bh = bh; + sh->type = SH_COMPUTE; + sh->flexible_work_groups = flex; + return true; + } + + // If both shaders are flexible, pick the larger of the two + if (sh->flexible_work_groups && flex) { + *sh_bw = PL_MAX(*sh_bw, bw); + *sh_bh = PL_MAX(*sh_bh, bh); + pl_assert(*sh_bw * *sh_bh <= glsl.max_group_threads); + return true; + } + + // At this point we're looking only at a non-flexible compute shader + pl_assert(sh->type == SH_COMPUTE && !sh->flexible_work_groups); + if (!flex) { + // Ensure parameters match + if (bw != *sh_bw || bh != *sh_bh) { + PL_TRACE(sh, "Disabling compute shader due to incompatible group " + "sizes %dx%d and %dx%d", *sh_bw, *sh_bh, bw, bh); + sh->shmem -= mem; + return false; + } + } + + return true; +} + +bool pl_shader_is_compute(const pl_shader sh) +{ + return sh->type == SH_COMPUTE; +} + +bool pl_shader_output_size(const pl_shader sh, int *w, int *h) +{ + if (!sh->output_w || !sh->output_h) + return false; + + *w = sh->transpose ? sh->output_h : sh->output_w; + *h = sh->transpose ? 
sh->output_w : sh->output_h; + return true; +} + +ident_t sh_fresh(pl_shader sh, const char *name) +{ + unsigned short id = ++sh->fresh; + assert(!(sh->prefix & id)); + id |= sh->prefix; + + assert(name); + return sh_mkident(id, name); +} + +static inline ident_t sh_fresh_name(pl_shader sh, const char **pname) +{ + ident_t id = sh_fresh(sh, *pname); + *pname = sh_ident_pack(id); + return id; +} + +ident_t sh_var(pl_shader sh, struct pl_shader_var sv) +{ + ident_t id = sh_fresh_name(sh, &sv.var.name); + struct pl_var_layout layout = pl_var_host_layout(0, &sv.var); + sv.data = sh_memdup(sh, sv.data, layout.size, layout.stride); + PL_ARRAY_APPEND(sh, sh->vars, sv); + return id; +} + +ident_t sh_var_int(pl_shader sh, const char *name, int val, bool dynamic) +{ + return sh_var(sh, (struct pl_shader_var) { + .var = pl_var_int(name), + .data = &val, + .dynamic = dynamic, + }); +} + +ident_t sh_var_uint(pl_shader sh, const char *name, unsigned int val, bool dynamic) +{ + return sh_var(sh, (struct pl_shader_var) { + .var = pl_var_uint(name), + .data = &val, + .dynamic = dynamic, + }); +} + +ident_t sh_var_float(pl_shader sh, const char *name, float val, bool dynamic) +{ + return sh_var(sh, (struct pl_shader_var) { + .var = pl_var_float(name), + .data = &val, + .dynamic = dynamic, + }); +} + +ident_t sh_var_mat3(pl_shader sh, const char *name, pl_matrix3x3 val) +{ + return sh_var(sh, (struct pl_shader_var) { + .var = pl_var_mat3(name), + .data = PL_TRANSPOSE_3X3(val.m), + }); +} + +ident_t sh_desc(pl_shader sh, struct pl_shader_desc sd) +{ + switch (sd.desc.type) { + case PL_DESC_BUF_UNIFORM: + case PL_DESC_BUF_STORAGE: + for (int i = 0; i < sh->descs.num; i++) // ensure uniqueness + pl_assert(sh->descs.elem[i].binding.object != sd.binding.object); + size_t bsize = sizeof(sd.buffer_vars[0]) * sd.num_buffer_vars; + sd.buffer_vars = sh_memdup(sh, sd.buffer_vars, bsize, + alignof(struct pl_buffer_var)); + for (int i = 0; i < sd.num_buffer_vars; i++) { + struct pl_var *bv = &sd.buffer_vars[i].var; + const char *name = bv->name; + GLSLP("#define %s "$"\n", name, sh_fresh_name(sh, &bv->name)); + } + break; + + case PL_DESC_BUF_TEXEL_UNIFORM: + case PL_DESC_BUF_TEXEL_STORAGE: + case PL_DESC_SAMPLED_TEX: + case PL_DESC_STORAGE_IMG: + pl_assert(!sd.num_buffer_vars); + break; + + case PL_DESC_INVALID: + case PL_DESC_TYPE_COUNT: + pl_unreachable(); + } + + ident_t id = sh_fresh_name(sh, &sd.desc.name); + PL_ARRAY_APPEND(sh, sh->descs, sd); + return id; +} + +ident_t sh_const(pl_shader sh, struct pl_shader_const sc) +{ + if (SH_PARAMS(sh).dynamic_constants && !sc.compile_time) { + return sh_var(sh, (struct pl_shader_var) { + .var = { + .name = sc.name, + .type = sc.type, + .dim_v = 1, + .dim_m = 1, + .dim_a = 1, + }, + .data = sc.data, + }); + } + + ident_t id = sh_fresh_name(sh, &sc.name); + + pl_gpu gpu = SH_GPU(sh); + if (gpu && gpu->limits.max_constants) { + if (!sc.compile_time || gpu->limits.array_size_constants) { + size_t size = pl_var_type_size(sc.type); + sc.data = sh_memdup(sh, sc.data, size, size); + PL_ARRAY_APPEND(sh, sh->consts, sc); + return id; + } + } + + // Fallback for GPUs without specialization constants + switch (sc.type) { + case PL_VAR_SINT: + GLSLH("const int "$" = %d; \n", id, *(int *) sc.data); + return id; + case PL_VAR_UINT: + GLSLH("const uint "$" = uint(%u); \n", id, *(unsigned int *) sc.data); + return id; + case PL_VAR_FLOAT: + GLSLH("const float "$" = float(%f); \n", id, *(float *) sc.data); + return id; + case PL_VAR_INVALID: + case PL_VAR_TYPE_COUNT: + break; + } + + 
pl_unreachable(); +} + +ident_t sh_const_int(pl_shader sh, const char *name, int val) +{ + return sh_const(sh, (struct pl_shader_const) { + .type = PL_VAR_SINT, + .name = name, + .data = &val, + }); +} + +ident_t sh_const_uint(pl_shader sh, const char *name, unsigned int val) +{ + return sh_const(sh, (struct pl_shader_const) { + .type = PL_VAR_UINT, + .name = name, + .data = &val, + }); +} + +ident_t sh_const_float(pl_shader sh, const char *name, float val) +{ + return sh_const(sh, (struct pl_shader_const) { + .type = PL_VAR_FLOAT, + .name = name, + .data = &val, + }); +} + +ident_t sh_attr(pl_shader sh, struct pl_shader_va sva) +{ + const size_t vsize = sva.attr.fmt->texel_size; + uint8_t *data = sh_alloc(sh, vsize * 4, vsize); + for (int i = 0; i < 4; i++) { + memcpy(data, sva.data[i], vsize); + sva.data[i] = data; + data += vsize; + } + + ident_t id = sh_fresh_name(sh, &sva.attr.name); + PL_ARRAY_APPEND(sh, sh->vas, sva); + return id; +} + +ident_t sh_attr_vec2(pl_shader sh, const char *name, const pl_rect2df *rc) +{ + pl_gpu gpu = SH_GPU(sh); + if (!gpu) { + SH_FAIL(sh, "Failed adding vertex attr '%s': No GPU available!", name); + return NULL_IDENT; + } + + pl_fmt fmt = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 2); + if (!fmt) { + SH_FAIL(sh, "Failed adding vertex attr '%s': no vertex fmt!", name); + return NULL_IDENT; + } + + float verts[4][2] = { + { rc->x0, rc->y0 }, + { rc->x1, rc->y0 }, + { rc->x0, rc->y1 }, + { rc->x1, rc->y1 }, + }; + + return sh_attr(sh, (struct pl_shader_va) { + .attr = { + .name = name, + .fmt = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 2), + }, + .data = { verts[0], verts[1], verts[2], verts[3] }, + }); +} + +ident_t sh_bind(pl_shader sh, pl_tex tex, + enum pl_tex_address_mode address_mode, + enum pl_tex_sample_mode sample_mode, + const char *name, const pl_rect2df *rect, + ident_t *out_pos, ident_t *out_pt) +{ + if (pl_tex_params_dimension(tex->params) != 2) { + SH_FAIL(sh, "Failed binding texture '%s': not a 2D texture!", name); + return NULL_IDENT; + } + + if (!tex->params.sampleable) { + SH_FAIL(sh, "Failed binding texture '%s': texture not sampleable!", name); + return NULL_IDENT; + } + + ident_t itex = sh_desc(sh, (struct pl_shader_desc) { + .desc = { + .name = name, + .type = PL_DESC_SAMPLED_TEX, + }, + .binding = { + .object = tex, + .address_mode = address_mode, + .sample_mode = sample_mode, + }, + }); + + float sx, sy; + if (tex->sampler_type == PL_SAMPLER_RECT) { + sx = 1.0; + sy = 1.0; + } else { + sx = 1.0 / tex->params.w; + sy = 1.0 / tex->params.h; + } + + if (out_pos) { + pl_rect2df full = { + .x1 = tex->params.w, + .y1 = tex->params.h, + }; + + rect = PL_DEF(rect, &full); + *out_pos = sh_attr_vec2(sh, "tex_coord", &(pl_rect2df) { + .x0 = sx * rect->x0, .y0 = sy * rect->y0, + .x1 = sx * rect->x1, .y1 = sy * rect->y1, + }); + } + + if (out_pt) { + *out_pt = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_vec2("tex_pt"), + .data = &(float[2]) {sx, sy}, + }); + } + + return itex; +} + +bool sh_buf_desc_append(void *alloc, pl_gpu gpu, + struct pl_shader_desc *buf_desc, + struct pl_var_layout *out_layout, + const struct pl_var new_var) +{ + struct pl_buffer_var bv = { .var = new_var }; + size_t cur_size = sh_buf_desc_size(buf_desc); + + switch (buf_desc->desc.type) { + case PL_DESC_BUF_UNIFORM: + bv.layout = pl_std140_layout(cur_size, &new_var); + if (bv.layout.offset + bv.layout.size > gpu->limits.max_ubo_size) + return false; + break; + case PL_DESC_BUF_STORAGE: + bv.layout = pl_std430_layout(cur_size, &new_var); + if (bv.layout.offset + 
bv.layout.size > gpu->limits.max_ssbo_size) + return false; + break; + case PL_DESC_INVALID: + case PL_DESC_SAMPLED_TEX: + case PL_DESC_STORAGE_IMG: + case PL_DESC_BUF_TEXEL_UNIFORM: + case PL_DESC_BUF_TEXEL_STORAGE: + case PL_DESC_TYPE_COUNT: + pl_unreachable(); + } + + if (out_layout) + *out_layout = bv.layout; + PL_ARRAY_APPEND_RAW(alloc, buf_desc->buffer_vars, buf_desc->num_buffer_vars, bv); + return true; +} + +size_t sh_buf_desc_size(const struct pl_shader_desc *buf_desc) +{ + if (!buf_desc->num_buffer_vars) + return 0; + + const struct pl_buffer_var *last; + last = &buf_desc->buffer_vars[buf_desc->num_buffer_vars - 1]; + return last->layout.offset + last->layout.size; +} + +void sh_describef(pl_shader sh, const char *fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + sh_describe(sh, pl_vasprintf(sh->info->tmp, fmt, ap)); + va_end(ap); +} + +static const char *insigs[] = { + [PL_SHADER_SIG_NONE] = "", + [PL_SHADER_SIG_COLOR] = "vec4 color", +}; + +static const char *outsigs[] = { + [PL_SHADER_SIG_NONE] = "void", + [PL_SHADER_SIG_COLOR] = "vec4", +}; + +static const char *retvals[] = { + [PL_SHADER_SIG_NONE] = "", + [PL_SHADER_SIG_COLOR] = "return color;", +}; + +// libplacebo currently only allows 2D samplers for shader signatures +static const char *samplers2D[] = { + [PL_SAMPLER_NORMAL] = "sampler2D", + [PL_SAMPLER_RECT] = "sampler2DRect", + [PL_SAMPLER_EXTERNAL] = "samplerExternalOES", +}; + +ident_t sh_subpass(pl_shader sh, pl_shader sub) +{ + pl_assert(sh->mutable); + + if (sh->prefix == sub->prefix) { + PL_TRACE(sh, "Can't merge shaders: conflicting identifiers!"); + return NULL_IDENT; + } + + // Check for shader compatibility + int res_w = PL_DEF(sh->output_w, sub->output_w), + res_h = PL_DEF(sh->output_h, sub->output_h); + + if ((sub->output_w && res_w != sub->output_w) || + (sub->output_h && res_h != sub->output_h)) + { + PL_TRACE(sh, "Can't merge shaders: incompatible sizes: %dx%d and %dx%d", + sh->output_w, sh->output_h, sub->output_w, sub->output_h); + return NULL_IDENT; + } + + if (sub->type == SH_COMPUTE) { + int subw = sub->group_size[0], + subh = sub->group_size[1]; + bool flex = sub->flexible_work_groups; + + if (!sh_try_compute(sh, subw, subh, flex, sub->shmem)) { + PL_TRACE(sh, "Can't merge shaders: incompatible block sizes or " + "exceeded shared memory resource capabilities"); + return NULL_IDENT; + } + } + + sh->output_w = res_w; + sh->output_h = res_h; + + // Append the prelude and header + pl_str_builder_concat(sh->buffers[SH_BUF_PRELUDE], sub->buffers[SH_BUF_PRELUDE]); + pl_str_builder_concat(sh->buffers[SH_BUF_HEADER], sub->buffers[SH_BUF_HEADER]); + + // Append the body as a new header function + if (sub->input == PL_SHADER_SIG_SAMPLER) { + pl_assert(sub->sampler_prefix); + GLSLH("%s "$"(%c%s src_tex, vec2 tex_coord) {\n", + outsigs[sub->output], sub->name, + sub->sampler_prefix, samplers2D[sub->sampler_type]); + } else { + GLSLH("%s "$"(%s) {\n", + outsigs[sub->output], sub->name, insigs[sub->input]); + } + pl_str_builder_concat(sh->buffers[SH_BUF_HEADER], sub->buffers[SH_BUF_BODY]); + GLSLH("%s\n}\n\n", retvals[sub->output]); + + // Steal all inputs and objects from the subpass +#define ARRAY_STEAL(arr) do \ +{ \ + PL_ARRAY_CONCAT(sh, sh->arr, sub->arr); \ + sub->arr.num = 0; \ +} while (0) + + ARRAY_STEAL(obj); + ARRAY_STEAL(vas); + ARRAY_STEAL(vars); + ARRAY_STEAL(descs); + ARRAY_STEAL(consts); +#undef ARRAY_STEAL + + // Steal the scratch buffer (if it holds data) + if (sub->data.len) { + pl_steal(sh->tmp, sub->data.buf); + sub->data = (pl_str) {0}; + } 
+ + // Steal all temporary allocations and mark the child as unusable + pl_steal(sh->tmp, sub->tmp); + sub->tmp = pl_tmp(sub); + sub->failed = true; + + // Steal the shader steps array (and allocations) + pl_assert(pl_rc_count(&sub->info->rc) == 1); + PL_ARRAY_CONCAT(sh->info, sh->info->steps, sub->info->steps); + pl_steal(sh->info->tmp, sub->info->tmp); + sub->info->tmp = pl_tmp(sub->info); + sub->info->steps.num = 0; // sanity + + return sub->name; +} + +pl_str_builder sh_finalize_internal(pl_shader sh) +{ + pl_assert(sh->mutable); // this function should only ever be called once + if (sh->failed) + return NULL; + + // Padding for readability + GLSLP("\n"); + + // Concatenate everything onto the prelude to form the final output + pl_str_builder_concat(sh->buffers[SH_BUF_PRELUDE], sh->buffers[SH_BUF_HEADER]); + + if (sh->input == PL_SHADER_SIG_SAMPLER) { + pl_assert(sh->sampler_prefix); + GLSLP("%s "$"(%c%s src_tex, vec2 tex_coord) {\n", + outsigs[sh->output], sh->name, + sh->sampler_prefix, + samplers2D[sh->sampler_type]); + } else { + GLSLP("%s "$"(%s) {\n", outsigs[sh->output], sh->name, insigs[sh->input]); + } + + pl_str_builder_concat(sh->buffers[SH_BUF_PRELUDE], sh->buffers[SH_BUF_BODY]); + pl_str_builder_concat(sh->buffers[SH_BUF_PRELUDE], sh->buffers[SH_BUF_FOOTER]); + GLSLP("%s\n}\n\n", retvals[sh->output]); + + // Generate the shader info + struct sh_info *info = sh->info; + info->info.steps = info->steps.elem; + info->info.num_steps = info->steps.num; + info->info.description = "(unknown shader)"; + + // Generate pretty description + for (int i = 0; i < info->steps.num; i++) { + const char *step = info->steps.elem[i]; + + // Prevent duplicates. We're okay using a weak equality check here + // because most pass descriptions are static strings. + for (int j = 0; j < i; j++) { + if (info->steps.elem[j] == step) + goto next_step; + } + + int count = 1; + for (int j = i+1; j < info->steps.num; j++) { + if (info->steps.elem[j] == step) + count++; + } + + const char *prefix = i > 0 ? 
", " : ""; + if (count > 1) { + pl_str_append_asprintf(info, &info->desc, "%s%s x%d", + prefix, step, count); + } else { + pl_str_append_asprintf(info, &info->desc, "%s%s", prefix, step); + } + +next_step: ; + } + + if (info->desc.len) + info->info.description = (char *) info->desc.buf; + + sh->mutable = false; + return sh->buffers[SH_BUF_PRELUDE]; +} + +const struct pl_shader_res *pl_shader_finalize(pl_shader sh) +{ + if (sh->failed) { + return NULL; + } else if (!sh->mutable) { + return &sh->result; + } + + pl_shader_info info = &sh->info->info; + pl_str_builder glsl = sh_finalize_internal(sh); + + // Turn ident_t into friendly strings before passing it to users +#define FIX_IDENT(name) \ + name = sh_ident_tostr(sh_ident_unpack(name)) + for (int i = 0; i < sh->vas.num; i++) + FIX_IDENT(sh->vas.elem[i].attr.name); + for (int i = 0; i < sh->vars.num; i++) + FIX_IDENT(sh->vars.elem[i].var.name); + for (int i = 0; i < sh->consts.num; i++) + FIX_IDENT(sh->consts.elem[i].name); + for (int i = 0; i < sh->descs.num; i++) { + struct pl_shader_desc *sd = &sh->descs.elem[i]; + FIX_IDENT(sd->desc.name); + for (int j = 0; j < sd->num_buffer_vars; sd++) + FIX_IDENT(sd->buffer_vars[j].var.name); + } +#undef FIX_IDENT + + sh->result = (struct pl_shader_res) { + .info = info, + .glsl = (char *) pl_str_builder_exec(glsl).buf, + .name = sh_ident_tostr(sh->name), + .input = sh->input, + .output = sh->output, + .compute_group_size = { sh->group_size[0], sh->group_size[1] }, + .compute_shmem = sh->shmem, + .vertex_attribs = sh->vas.elem, + .num_vertex_attribs = sh->vas.num, + .variables = sh->vars.elem, + .num_variables = sh->vars.num, + .descriptors = sh->descs.elem, + .num_descriptors = sh->descs.num, + .constants = sh->consts.elem, + .num_constants = sh->consts.num, + // deprecated fields + .params = info->params, + .steps = info->steps, + .num_steps = info->num_steps, + .description = info->description, + }; + + return &sh->result; +} + +bool sh_require(pl_shader sh, enum pl_shader_sig insig, int w, int h) +{ + if (sh->failed) { + SH_FAIL(sh, "Attempting to modify a failed shader!"); + return false; + } + + if (!sh->mutable) { + SH_FAIL(sh, "Attempted to modify an immutable shader!"); + return false; + } + + if ((w && sh->output_w && sh->output_w != w) || + (h && sh->output_h && sh->output_h != h)) + { + SH_FAIL(sh, "Illegal sequence of shader operations: Incompatible " + "output size requirements %dx%d and %dx%d", + sh->output_w, sh->output_h, w, h); + return false; + } + + static const char *names[] = { + [PL_SHADER_SIG_NONE] = "PL_SHADER_SIG_NONE", + [PL_SHADER_SIG_COLOR] = "PL_SHADER_SIG_COLOR", + }; + + // If we require an input, but there is none available - just get it from + // the user by turning it into an explicit input signature. + if (!sh->output && insig) { + pl_assert(!sh->input); + sh->input = insig; + } else if (sh->output != insig) { + SH_FAIL(sh, "Illegal sequence of shader operations! 
Current output " + "signature is '%s', but called operation expects '%s'!", + names[sh->output], names[insig]); + return false; + } + + // All of our shaders end up returning a vec4 color + sh->output = PL_SHADER_SIG_COLOR; + sh->output_w = PL_DEF(sh->output_w, w); + sh->output_h = PL_DEF(sh->output_h, h); + return true; +} + +static void sh_obj_deref(pl_shader_obj obj) +{ + if (!pl_rc_deref(&obj->rc)) + return; + + if (obj->uninit) + obj->uninit(obj->gpu, obj->priv); + + pl_free(obj); +} + +void pl_shader_obj_destroy(pl_shader_obj *ptr) +{ + pl_shader_obj obj = *ptr; + if (!obj) + return; + + sh_obj_deref(obj); + *ptr = NULL; +} + +void *sh_require_obj(pl_shader sh, pl_shader_obj *ptr, + enum pl_shader_obj_type type, size_t priv_size, + void (*uninit)(pl_gpu gpu, void *priv)) +{ + if (!ptr) + return NULL; + + pl_shader_obj obj = *ptr; + if (obj && obj->gpu != SH_GPU(sh)) { + SH_FAIL(sh, "Passed pl_shader_obj belongs to different GPU!"); + return NULL; + } + + if (obj && obj->type != type) { + SH_FAIL(sh, "Passed pl_shader_obj of wrong type! Shader objects must " + "always be used with the same type of shader."); + return NULL; + } + + if (!obj) { + obj = pl_zalloc_ptr(NULL, obj); + pl_rc_init(&obj->rc); + obj->gpu = SH_GPU(sh); + obj->type = type; + obj->priv = pl_zalloc(obj, priv_size); + obj->uninit = uninit; + } + + PL_ARRAY_APPEND(sh, sh->obj, obj); + pl_rc_ref(&obj->rc); + + *ptr = obj; + return obj->priv; +} + +ident_t sh_prng(pl_shader sh, bool temporal, ident_t *p_state) +{ + ident_t randfun = sh_fresh(sh, "rand"), + state = sh_fresh(sh, "state"); + + // Based on pcg3d (http://jcgt.org/published/0009/03/02/) + GLSLP("#define prng_t uvec3\n"); + GLSLH("vec3 "$"(inout uvec3 s) { \n" + " s = 1664525u * s + uvec3(1013904223u); \n" + " s.x += s.y * s.z; \n" + " s.y += s.z * s.x; \n" + " s.z += s.x * s.y; \n" + " s ^= s >> 16u; \n" + " s.x += s.y * s.z; \n" + " s.y += s.z * s.x; \n" + " s.z += s.x * s.y; \n" + " return vec3(s) * 1.0/float(0xFFFFFFFFu); \n" + "} \n", + randfun); + + if (temporal) { + GLSL("uvec3 "$" = uvec3(gl_FragCoord.xy, "$"); \n", + state, SH_UINT_DYN(SH_PARAMS(sh).index)); + } else { + GLSL("uvec3 "$" = uvec3(gl_FragCoord.xy, 0.0); \n", state); + } + + if (p_state) + *p_state = state; + + ident_t res = sh_fresh(sh, "RAND"); + GLSLH("#define "$" ("$"("$"))\n", res, randfun, state); + return res; +} diff --git a/src/shaders.h b/src/shaders.h new file mode 100644 index 0000000..7656a35 --- /dev/null +++ b/src/shaders.h @@ -0,0 +1,387 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include <stdio.h> +#include <limits.h> + +#include "common.h" +#include "cache.h" +#include "log.h" +#include "gpu.h" + +#include <libplacebo/shaders.h> + +// This represents an identifier (e.g. name of function, uniform etc.) for +// a shader resource. Not human-readable. 
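+// Identifiers are spliced into GLSL snippets through the "$" format macro
+// defined below ("_%hx"), which renders the numeric ID as a short,
+// collision-free token. A minimal usage sketch, with hypothetical values:
+//
+//     ident_t id = sh_fresh(sh, "scale");  // fresh numeric ID, e.g. 0x1a2
+//     GLSL("color.rgb *= "$";\n", id);     // emits: color.rgb *= _1a2;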
+ +typedef unsigned short ident_t; +#define $ "_%hx" +#define NULL_IDENT 0u + +#define sh_mkident(id, name) ((ident_t) id) +#define sh_ident_tostr(id) pl_asprintf(sh->tmp, $, id) + +enum { + IDENT_BITS = 8 * sizeof(ident_t), + IDENT_MASK = (uintptr_t) USHRT_MAX, + IDENT_SENTINEL = (uintptr_t) 0x20230319 << IDENT_BITS, +}; + +// Functions to pack/unpack an identifier into a `const char *` name field. +// Used to defer string templating of friendly names until actually necessary +static inline const char *sh_ident_pack(ident_t id) +{ + return (const char *)(uintptr_t) (IDENT_SENTINEL | id); +} + +static inline ident_t sh_ident_unpack(const char *name) +{ + uintptr_t uname = (uintptr_t) name; + assert((uname & ~IDENT_MASK) == IDENT_SENTINEL); + return uname & IDENT_MASK; +} + +enum pl_shader_buf { + SH_BUF_PRELUDE, // extra #defines etc. + SH_BUF_HEADER, // previous passes, helper function definitions, etc. + SH_BUF_BODY, // partial contents of the "current" function + SH_BUF_FOOTER, // will be appended to the end of the current function + SH_BUF_COUNT, +}; + +enum pl_shader_type { + SH_AUTO, + SH_COMPUTE, + SH_FRAGMENT +}; + +struct sh_info { + // public-facing struct + struct pl_shader_info_t info; + + // internal fields + void *tmp; + pl_rc_t rc; + pl_str desc; + PL_ARRAY(const char *) steps; +}; + +struct pl_shader_t { + pl_log log; + void *tmp; // temporary allocations (freed on pl_shader_reset) + struct sh_info *info; + pl_str data; // pooled/recycled scratch buffer for small allocations + PL_ARRAY(pl_shader_obj) obj; + bool failed; + bool mutable; + ident_t name; + enum pl_shader_sig input, output; + int output_w; + int output_h; + bool transpose; + pl_str_builder buffers[SH_BUF_COUNT]; + enum pl_shader_type type; + bool flexible_work_groups; + int group_size[2]; + size_t shmem; + enum pl_sampler_type sampler_type; + char sampler_prefix; + unsigned short prefix; // pre-processed version of res.params.id + unsigned short fresh; + + // Note: internally, these `pl_shader_va` etc. use raw ident_t fields + // instead of `const char *` wherever a name is required! These are + // translated to legal strings either in `pl_shader_finalize`, or inside + // the `pl_dispatch` shader compilation step. + PL_ARRAY(struct pl_shader_va) vas; + PL_ARRAY(struct pl_shader_var) vars; + PL_ARRAY(struct pl_shader_desc) descs; + PL_ARRAY(struct pl_shader_const) consts; + + // cached result of `pl_shader_finalize` + struct pl_shader_res result; +}; + +// Free temporary resources associated with a shader. Normally called by +// pl_shader_reset(), but used internally to reduce memory waste. +void sh_deref(pl_shader sh); + +// Same as `pl_shader_finalize` but doesn't generate `sh->res`, instead returns +// the string builder to be used to finalize the shader. Assumes the caller +// will access the shader's internal fields directly. +pl_str_builder sh_finalize_internal(pl_shader sh); + +// Helper functions for convenience +#define SH_PARAMS(sh) ((sh)->info->info.params) +#define SH_GPU(sh) (SH_PARAMS(sh).gpu) +#define SH_CACHE(sh) pl_gpu_cache(SH_GPU(sh)) + +// Returns the GLSL version, defaulting to desktop 130. +struct pl_glsl_version sh_glsl(const pl_shader sh); + +#define SH_FAIL(sh, ...) do { \ + sh->failed = true; \ + PL_ERR(sh, __VA_ARGS__); \ + } while (0) + +// Attempt enabling compute shaders for this pass, if possible +bool sh_try_compute(pl_shader sh, int bw, int bh, bool flex, size_t mem); + +// Attempt merging a secondary shader into the current shader. Returns NULL if +// merging fails (e.g. 
incompatible signatures); otherwise returns an identifier +// corresponding to the generated subpass function. +// +// If successful, the subpass shader is set to an undefined failure state and +// must be explicitly reset/aborted before being re-used. +ident_t sh_subpass(pl_shader sh, pl_shader sub); + +// Helpers for adding new variables/descriptors/etc. with fresh, unique +// identifier names. These will never conflict with other identifiers, even +// if the shaders are merged together. +ident_t sh_fresh(pl_shader sh, const char *name); + +// Add a new shader var and return its identifier +ident_t sh_var(pl_shader sh, struct pl_shader_var sv); + +// Helper functions for `sh_var` +ident_t sh_var_int(pl_shader sh, const char *name, int val, bool dynamic); +ident_t sh_var_uint(pl_shader sh, const char *name, unsigned int val, bool dynamic); +ident_t sh_var_float(pl_shader sh, const char *name, float val, bool dynamic); +ident_t sh_var_mat3(pl_shader sh, const char *name, pl_matrix3x3 val); +#define SH_INT_DYN(val) sh_var_int(sh, "const", val, true) +#define SH_UINT_DYN(val) sh_var_uint(sh, "const", val, true) +#define SH_FLOAT_DYN(val) sh_var_float(sh, "const", val, true) +#define SH_MAT3(val) sh_var_mat3(sh, "mat", val) + +// Add a new shader desc and return its identifier. +ident_t sh_desc(pl_shader sh, struct pl_shader_desc sd); + +// Add a new shader constant and return its identifier. +ident_t sh_const(pl_shader sh, struct pl_shader_const sc); + +// Helper functions for `sh_const` +ident_t sh_const_int(pl_shader sh, const char *name, int val); +ident_t sh_const_uint(pl_shader sh, const char *name, unsigned int val); +ident_t sh_const_float(pl_shader sh, const char *name, float val); +#define SH_INT(val) sh_const_int(sh, "const", val) +#define SH_UINT(val) sh_const_uint(sh, "const", val) +#define SH_FLOAT(val) sh_const_float(sh, "const", val) + +// Add a new shader va and return its identifier +ident_t sh_attr(pl_shader sh, struct pl_shader_va sva); + +// Helper to add a a vec2 VA from a pl_rect2df. Returns NULL_IDENT on failure. +ident_t sh_attr_vec2(pl_shader sh, const char *name, const pl_rect2df *rc); + +// Bind a texture under a given transformation and make its attributes +// available as well. If an output pointer for one of the attributes is left +// as NULL, that attribute will not be added. Returns NULL on failure. `rect` +// is optional, and defaults to the full texture if left as NULL. +// +// Note that for e.g. compute shaders, the vec2 out_pos might be a macro that +// expands to an expensive computation, and should be cached by the user. +ident_t sh_bind(pl_shader sh, pl_tex tex, + enum pl_tex_address_mode address_mode, + enum pl_tex_sample_mode sample_mode, + const char *name, const pl_rect2df *rect, + ident_t *out_pos, ident_t *out_pt); + +// Incrementally build up a buffer by adding new variable elements to the +// buffer, resizing buf.buffer_vars if necessary. Returns whether or not the +// variable could be successfully added (which may fail if you try exceeding +// the size limits of the buffer type). If successful, the layout is stored +// in *out_layout (may be NULL). +bool sh_buf_desc_append(void *alloc, pl_gpu gpu, + struct pl_shader_desc *buf_desc, + struct pl_var_layout *out_layout, + const struct pl_var new_var); + +size_t sh_buf_desc_size(const struct pl_shader_desc *buf_desc); + + +// Underlying function for appending text to a shader +#define sh_append(sh, buf, ...) 
\ + pl_str_builder_addf((sh)->buffers[buf], __VA_ARGS__) + +#define sh_append_str(sh, buf, str) \ + pl_str_builder_str((sh)->buffers[buf], str) + +#define GLSLP(...) sh_append(sh, SH_BUF_PRELUDE, __VA_ARGS__) +#define GLSLH(...) sh_append(sh, SH_BUF_HEADER, __VA_ARGS__) +#define GLSL(...) sh_append(sh, SH_BUF_BODY, __VA_ARGS__) +#define GLSLF(...) sh_append(sh, SH_BUF_FOOTER, __VA_ARGS__) + +// Attach a description to a shader +void sh_describef(pl_shader sh, const char *fmt, ...) + PL_PRINTF(2, 3); + +static inline void sh_describe(pl_shader sh, const char *desc) +{ + PL_ARRAY_APPEND(sh->info, sh->info->steps, desc); +}; + +// Requires that the share is mutable, has an output signature compatible +// with the given input signature, as well as an output size compatible with +// the given size requirements. Errors and returns false otherwise. +bool sh_require(pl_shader sh, enum pl_shader_sig insig, int w, int h); + +// Shader resources + +enum pl_shader_obj_type { + PL_SHADER_OBJ_INVALID = 0, + PL_SHADER_OBJ_COLOR_MAP, + PL_SHADER_OBJ_SAMPLER, + PL_SHADER_OBJ_DITHER, + PL_SHADER_OBJ_LUT, + PL_SHADER_OBJ_AV1_GRAIN, + PL_SHADER_OBJ_FILM_GRAIN, + PL_SHADER_OBJ_RESHAPE, +}; + +struct pl_shader_obj_t { + enum pl_shader_obj_type type; + pl_rc_t rc; + pl_gpu gpu; + void (*uninit)(pl_gpu gpu, void *priv); + void *priv; +}; + +// Returns (*ptr)->priv, or NULL on failure +void *sh_require_obj(pl_shader sh, pl_shader_obj *ptr, + enum pl_shader_obj_type type, size_t priv_size, + void (*uninit)(pl_gpu gpu, void *priv)); + +#define SH_OBJ(sh, ptr, type, t, uninit) \ + ((t*) sh_require_obj(sh, ptr, type, sizeof(t), uninit)) + +// Initializes a PRNG. The resulting string will directly evaluate to a +// pseudorandom, uniformly distributed vec3 from [0.0,1.0]. Since this +// algorithm works by mutating a state variable, if the user wants to use the +// resulting PRNG inside a subfunction, they must add an extra `inout prng_t %s` +// with the contents of `state` to the signature. (Optional) +// +// If `temporal` is set, the PRNG will vary across frames. +ident_t sh_prng(pl_shader sh, bool temporal, ident_t *state); + +// Backing memory type +enum sh_lut_type { + SH_LUT_AUTO = 0, // pick whatever makes the most sense + SH_LUT_TEXTURE, // upload as texture + SH_LUT_UNIFORM, // uniform array + SH_LUT_LITERAL, // constant / literal array in shader source (fallback) +}; + +// Interpolation method +enum sh_lut_method { + SH_LUT_NONE = 0, // no interpolation, integer indices + SH_LUT_LINEAR, // linear interpolation, vecN indices in range [0,1] + SH_LUT_CUBIC, // (bi/tri)cubic interpolation + SH_LUT_TETRAHEDRAL, // tetrahedral interpolation for vec3, equivalent to + // SH_LUT_LINEAR for lower dimensions +}; + +struct sh_lut_params { + pl_shader_obj *object; + + // Type of the LUT we intend to generate. + // + // Note: If `var_type` is PL_VAR_*INT, `method` must be SH_LUT_NONE. + enum pl_var_type var_type; + enum sh_lut_type lut_type; + enum sh_lut_method method; + + // For SH_LUT_TEXTURE, this can be used to override the texture's internal + // format, in which case it takes precedence over the default for `type`. + pl_fmt fmt; + + // LUT dimensions. Unused dimensions may be left as 0. + int width; + int height; + int depth; + int comps; + + // If true, the LUT will always be regenerated, even if the dimensions have + // not changed. + bool update; + + // Alternate way of triggering shader invalidations. If the signature + // does not match the LUT's signature, it will be regenerated. 
+ uint64_t signature; + + // If set to true, shader objects will be preserved and updated in-place + // rather than being treated as read-only. + bool dynamic; + + // If set , generated shader objects are automatically cached in this + // cache. Requires `signature` to be set (and uniquely identify the LUT). + pl_cache cache; + + // Will be called with a zero-initialized buffer whenever the data needs to + // be computed, which happens whenever the size is changed, the shader + // object is invalidated, or `update` is set to true. + // + // Note: Interpretation of `data` is according to `type` and `fmt`. + void (*fill)(void *data, const struct sh_lut_params *params); + void *priv; + + // Debug tag to track LUT source + pl_debug_tag debug_tag; +}; + +#define sh_lut_params(...) (&(struct sh_lut_params) { \ + .debug_tag = PL_DEBUG_TAG, \ + __VA_ARGS__ \ + }) + +// Makes a table of values available as a shader variable, using an a given +// method (falling back if needed). The resulting identifier can be sampled +// directly as %s(pos), where pos is a vector with the right number of +// dimensions. `pos` must be an integer vector within the bounds of the array, +// unless the method is `SH_LUT_LINEAR`, in which case it's a float vector that +// gets interpolated and clamped as needed. Returns NULL on error. +ident_t sh_lut(pl_shader sh, const struct sh_lut_params *params); + +static inline uint8_t sh_num_comps(uint8_t mask) +{ + pl_assert((mask & 0xF) == mask); + return __builtin_popcount(mask); +} + +static inline const char *sh_float_type(uint8_t mask) +{ + switch (sh_num_comps(mask)) { + case 1: return "float"; + case 2: return "vec2"; + case 3: return "vec3"; + case 4: return "vec4"; + } + + pl_unreachable(); +} + +static inline const char *sh_swizzle(uint8_t mask) +{ + static const char * const swizzles[0x10] = { + NULL, "r", "g", "rg", "b", "rb", "gb", "rgb", + "a", "ra", "ga", "rga", "ba", "rba", "gba", "rgba", + }; + + pl_assert(mask <= PL_ARRAY_SIZE(swizzles)); + return swizzles[mask]; +} diff --git a/src/shaders/colorspace.c b/src/shaders/colorspace.c new file mode 100644 index 0000000..c7b3b5a --- /dev/null +++ b/src/shaders/colorspace.c @@ -0,0 +1,2120 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include <math.h> + +#include "cache.h" +#include "shaders.h" + +#include <libplacebo/shaders/colorspace.h> + +// Common constants for SMPTE ST.2084 (PQ) +static const float PQ_M1 = 2610./4096 * 1./4, + PQ_M2 = 2523./4096 * 128, + PQ_C1 = 3424./4096, + PQ_C2 = 2413./4096 * 32, + PQ_C3 = 2392./4096 * 32; + +// Common constants for ARIB STD-B67 (HLG) +static const float HLG_A = 0.17883277, + HLG_B = 0.28466892, + HLG_C = 0.55991073, + HLG_REF = 1000.0 / PL_COLOR_SDR_WHITE; + +// Common constants for Panasonic V-Log +static const float VLOG_B = 0.00873, + VLOG_C = 0.241514, + VLOG_D = 0.598206; + +// Common constants for Sony S-Log +static const float SLOG_A = 0.432699, + SLOG_B = 0.037584, + SLOG_C = 0.616596 + 0.03, + SLOG_P = 3.538813, + SLOG_Q = 0.030001, + SLOG_K2 = 155.0 / 219.0; + +void pl_shader_set_alpha(pl_shader sh, struct pl_color_repr *repr, + enum pl_alpha_mode mode) +{ + if (repr->alpha == PL_ALPHA_PREMULTIPLIED && mode == PL_ALPHA_INDEPENDENT) { + GLSL("if (color.a > 1e-6) \n" + " color.rgb /= vec3(color.a); \n"); + repr->alpha = PL_ALPHA_INDEPENDENT; + } + + if (repr->alpha == PL_ALPHA_INDEPENDENT && mode == PL_ALPHA_PREMULTIPLIED) { + GLSL("color.rgb *= vec3(color.a); \n"); + repr->alpha = PL_ALPHA_PREMULTIPLIED; + } +} + +#ifdef PL_HAVE_DOVI +static inline void reshape_mmr(pl_shader sh, ident_t mmr, bool single, + int min_order, int max_order) +{ + if (single) { + GLSL("const uint mmr_idx = 0u; \n"); + } else { + GLSL("uint mmr_idx = uint(coeffs.y); \n"); + } + + assert(min_order <= max_order); + if (min_order < max_order) + GLSL("uint order = uint(coeffs.w); \n"); + + GLSL("vec4 sigX; \n" + "s = coeffs.x; \n" + "sigX.xyz = sig.xxy * sig.yzz; \n" + "sigX.w = sigX.x * sig.z; \n" + "s += dot("$"[mmr_idx + 0].xyz, sig); \n" + "s += dot("$"[mmr_idx + 1], sigX); \n", + mmr, mmr); + + if (max_order >= 2) { + if (min_order < 2) + GLSL("if (order >= 2) { \n"); + + GLSL("vec3 sig2 = sig * sig; \n" + "vec4 sigX2 = sigX * sigX; \n" + "s += dot("$"[mmr_idx + 2].xyz, sig2); \n" + "s += dot("$"[mmr_idx + 3], sigX2); \n", + mmr, mmr); + + if (max_order == 3) { + if (min_order < 3) + GLSL("if (order >= 3 { \n"); + + GLSL("s += dot("$"[mmr_idx + 4].xyz, sig2 * sig); \n" + "s += dot("$"[mmr_idx + 5], sigX2 * sigX); \n", + mmr, mmr); + + if (min_order < 3) + GLSL("} \n"); + } + + if (min_order < 2) + GLSL("} \n"); + } +} + +static inline void reshape_poly(pl_shader sh) +{ + GLSL("s = (coeffs.z * s + coeffs.y) * s + coeffs.x; \n"); +} +#endif + +void pl_shader_dovi_reshape(pl_shader sh, const struct pl_dovi_metadata *data) +{ +#ifdef PL_HAVE_DOVI + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0) || !data) + return; + + sh_describe(sh, "reshaping"); + GLSL("// pl_shader_reshape \n" + "{ \n" + "vec3 sig; \n" + "vec4 coeffs; \n" + "float s; \n" + "sig = clamp(color.rgb, 0.0, 1.0); \n"); + + float coeffs_data[8][4]; + float mmr_packed_data[8*6][4]; + + for (int c = 0; c < 3; c++) { + const struct pl_reshape_data *comp = &data->comp[c]; + if (!comp->num_pivots) + continue; + + pl_assert(comp->num_pivots >= 2 && comp->num_pivots <= 9); + GLSL("s = sig[%d]; \n", c); + + // Prepare coefficients for GPU + bool has_poly = false, has_mmr = false, mmr_single = true; + int mmr_idx = 0, min_order = 3, max_order = 1; + memset(coeffs_data, 0, sizeof(coeffs_data)); + for (int i = 0; i < comp->num_pivots - 1; i++) { + switch (comp->method[i]) { + case 0: // polynomial + has_poly = true; + coeffs_data[i][3] = 0.0; // order=0 signals polynomial + for (int k = 0; k < 3; k++) + coeffs_data[i][k] = 
comp->poly_coeffs[i][k]; + break; + + case 1: + min_order = PL_MIN(min_order, comp->mmr_order[i]); + max_order = PL_MAX(max_order, comp->mmr_order[i]); + mmr_single = !has_mmr; + has_mmr = true; + coeffs_data[i][3] = (float) comp->mmr_order[i]; + coeffs_data[i][0] = comp->mmr_constant[i]; + coeffs_data[i][1] = (float) mmr_idx; + for (int j = 0; j < comp->mmr_order[i]; j++) { + // store weights per order as two packed vec4s + float *mmr = &mmr_packed_data[mmr_idx][0]; + mmr[0] = comp->mmr_coeffs[i][j][0]; + mmr[1] = comp->mmr_coeffs[i][j][1]; + mmr[2] = comp->mmr_coeffs[i][j][2]; + mmr[3] = 0.0; // unused + mmr[4] = comp->mmr_coeffs[i][j][3]; + mmr[5] = comp->mmr_coeffs[i][j][4]; + mmr[6] = comp->mmr_coeffs[i][j][5]; + mmr[7] = comp->mmr_coeffs[i][j][6]; + mmr_idx += 2; + } + break; + + default: + pl_unreachable(); + } + } + + if (comp->num_pivots > 2) { + + // Skip the (irrelevant) lower and upper bounds + float pivots_data[7]; + memcpy(pivots_data, comp->pivots + 1, + (comp->num_pivots - 2) * sizeof(pivots_data[0])); + + // Fill the remainder with a quasi-infinite sentinel pivot + for (int i = comp->num_pivots - 2; i < PL_ARRAY_SIZE(pivots_data); i++) + pivots_data[i] = 1e9f; + + ident_t pivots = sh_var(sh, (struct pl_shader_var) { + .data = pivots_data, + .var = { + .name = "pivots", + .type = PL_VAR_FLOAT, + .dim_v = 1, + .dim_m = 1, + .dim_a = PL_ARRAY_SIZE(pivots_data), + }, + }); + + ident_t coeffs = sh_var(sh, (struct pl_shader_var) { + .data = coeffs_data, + .var = { + .name = "coeffs", + .type = PL_VAR_FLOAT, + .dim_v = 4, + .dim_m = 1, + .dim_a = PL_ARRAY_SIZE(coeffs_data), + }, + }); + + // Efficiently branch into the correct set of coefficients + GLSL("#define test(i) bvec4(s >= "$"[i]) \n" + "#define coef(i) "$"[i] \n" + "coeffs = mix(mix(mix(coef(0), coef(1), test(0)), \n" + " mix(coef(2), coef(3), test(2)), \n" + " test(1)), \n" + " mix(mix(coef(4), coef(5), test(4)), \n" + " mix(coef(6), coef(7), test(6)), \n" + " test(5)), \n" + " test(3)); \n" + "#undef test \n" + "#undef coef \n", + pivots, coeffs); + + } else { + + // No need for a single pivot, just set the coeffs directly + GLSL("coeffs = "$"; \n", sh_var(sh, (struct pl_shader_var) { + .var = pl_var_vec4("coeffs"), + .data = coeffs_data, + })); + + } + + ident_t mmr = NULL_IDENT; + if (has_mmr) { + mmr = sh_var(sh, (struct pl_shader_var) { + .data = mmr_packed_data, + .var = { + .name = "mmr", + .type = PL_VAR_FLOAT, + .dim_v = 4, + .dim_m = 1, + .dim_a = mmr_idx, + }, + }); + } + + if (has_mmr && has_poly) { + GLSL("if (coeffs.w == 0.0) { \n"); + reshape_poly(sh); + GLSL("} else { \n"); + reshape_mmr(sh, mmr, mmr_single, min_order, max_order); + GLSL("} \n"); + } else if (has_poly) { + reshape_poly(sh); + } else { + assert(has_mmr); + GLSL("{ \n"); + reshape_mmr(sh, mmr, mmr_single, min_order, max_order); + GLSL("} \n"); + } + + ident_t lo = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_float("lo"), + .data = &comp->pivots[0], + }); + ident_t hi = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_float("hi"), + .data = &comp->pivots[comp->num_pivots - 1], + }); + GLSL("color[%d] = clamp(s, "$", "$"); \n", c, lo, hi); + } + + GLSL("} \n"); +#else + SH_FAIL(sh, "libplacebo was compiled without support for dolbyvision reshaping"); +#endif +} + +void pl_shader_decode_color(pl_shader sh, struct pl_color_repr *repr, + const struct pl_color_adjustment *params) +{ + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return; + + sh_describe(sh, "color decoding"); + GLSL("// pl_shader_decode_color \n" + "{ \n"); + + // 
Do this first because the following operations are potentially nonlinear + pl_shader_set_alpha(sh, repr, PL_ALPHA_INDEPENDENT); + + if (repr->sys == PL_COLOR_SYSTEM_XYZ || + repr->sys == PL_COLOR_SYSTEM_DOLBYVISION) + { + ident_t scale = SH_FLOAT(pl_color_repr_normalize(repr)); + GLSL("color.rgb *= vec3("$"); \n", scale); + } + + if (repr->sys == PL_COLOR_SYSTEM_XYZ) { + pl_shader_linearize(sh, &(struct pl_color_space) { + .transfer = PL_COLOR_TRC_ST428, + }); + } + + if (repr->sys == PL_COLOR_SYSTEM_DOLBYVISION) + pl_shader_dovi_reshape(sh, repr->dovi); + + enum pl_color_system orig_sys = repr->sys; + pl_transform3x3 tr = pl_color_repr_decode(repr, params); + + if (memcmp(&tr, &pl_transform3x3_identity, sizeof(tr))) { + ident_t cmat = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_mat3("cmat"), + .data = PL_TRANSPOSE_3X3(tr.mat.m), + }); + + ident_t cmat_c = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_vec3("cmat_c"), + .data = tr.c, + }); + + GLSL("color.rgb = "$" * color.rgb + "$"; \n", cmat, cmat_c); + } + + switch (orig_sys) { + case PL_COLOR_SYSTEM_BT_2020_C: + // Conversion for C'rcY'cC'bc via the BT.2020 CL system: + // C'bc = (B'-Y'c) / 1.9404 | C'bc <= 0 + // = (B'-Y'c) / 1.5816 | C'bc > 0 + // + // C'rc = (R'-Y'c) / 1.7184 | C'rc <= 0 + // = (R'-Y'c) / 0.9936 | C'rc > 0 + // + // as per the BT.2020 specification, table 4. This is a non-linear + // transformation because (constant) luminance receives non-equal + // contributions from the three different channels. + GLSL("// constant luminance conversion \n" + "color.br = color.br * mix(vec2(1.5816, 0.9936), \n" + " vec2(1.9404, 1.7184), \n" + " lessThanEqual(color.br, vec2(0.0))) \n" + " + color.gg; \n"); + // Expand channels to camera-linear light. This shader currently just + // assumes everything uses the BT.2020 12-bit gamma function, since the + // difference between 10 and 12-bit is negligible for anything other + // than 12-bit content. + GLSL("vec3 lin = mix(color.rgb * vec3(1.0/4.5), \n" + " pow((color.rgb + vec3(0.0993))*vec3(1.0/1.0993), \n" + " vec3(1.0/0.45)), \n" + " lessThanEqual(vec3(0.08145), color.rgb)); \n"); + // Calculate the green channel from the expanded RYcB, and recompress to G' + // The BT.2020 specification says Yc = 0.2627*R + 0.6780*G + 0.0593*B + GLSL("color.g = (lin.g - 0.2627*lin.r - 0.0593*lin.b)*1.0/0.6780; \n" + "color.g = mix(color.g * 4.5, \n" + " 1.0993 * pow(color.g, 0.45) - 0.0993, \n" + " 0.0181 <= color.g); \n"); + break; + + case PL_COLOR_SYSTEM_BT_2100_PQ:; + // Conversion process from the spec: + // + // 1. L'M'S' = cmat * ICtCp + // 2. LMS = linearize(L'M'S') (EOTF for PQ, inverse OETF for HLG) + // 3. RGB = lms2rgb * LMS + // + // After this we need to invert step 2 to arrive at non-linear RGB. 
+ // (It's important we keep the transfer function conversion separate + // from the color system decoding, so we have to partially undo our + // work here even though we will end up linearizing later on anyway) + + GLSL(// PQ EOTF + "color.rgb = pow(max(color.rgb, 0.0), vec3(1.0/%f)); \n" + "color.rgb = max(color.rgb - vec3(%f), 0.0) \n" + " / (vec3(%f) - vec3(%f) * color.rgb); \n" + "color.rgb = pow(color.rgb, vec3(1.0/%f)); \n" + // LMS matrix + "color.rgb = mat3( 3.43661, -0.79133, -0.0259499, \n" + " -2.50645, 1.98360, -0.0989137, \n" + " 0.06984, -0.192271, 1.12486) * color.rgb; \n" + // PQ OETF + "color.rgb = pow(max(color.rgb, 0.0), vec3(%f)); \n" + "color.rgb = (vec3(%f) + vec3(%f) * color.rgb) \n" + " / (vec3(1.0) + vec3(%f) * color.rgb); \n" + "color.rgb = pow(color.rgb, vec3(%f)); \n", + PQ_M2, PQ_C1, PQ_C2, PQ_C3, PQ_M1, + PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2); + break; + + case PL_COLOR_SYSTEM_BT_2100_HLG: + GLSL(// HLG OETF^-1 + "color.rgb = mix(vec3(4.0) * color.rgb * color.rgb, \n" + " exp((color.rgb - vec3(%f)) * vec3(1.0/%f)) \n" + " + vec3(%f), \n" + " lessThan(vec3(0.5), color.rgb)); \n" + // LMS matrix + "color.rgb = mat3( 3.43661, -0.79133, -0.0259499, \n" + " -2.50645, 1.98360, -0.0989137, \n" + " 0.06984, -0.192271, 1.12486) * color.rgb; \n" + // HLG OETF + "color.rgb = mix(vec3(0.5) * sqrt(color.rgb), \n" + " vec3(%f) * log(color.rgb - vec3(%f)) + vec3(%f), \n" + " lessThan(vec3(1.0), color.rgb)); \n", + HLG_C, HLG_A, HLG_B, + HLG_A, HLG_B, HLG_C); + break; + + case PL_COLOR_SYSTEM_DOLBYVISION:; +#ifdef PL_HAVE_DOVI + // Dolby Vision always outputs BT.2020-referred HPE LMS, so hard-code + // the inverse LMS->RGB matrix corresponding to this color space. + pl_matrix3x3 dovi_lms2rgb = {{ + { 3.06441879, -2.16597676, 0.10155818}, + {-0.65612108, 1.78554118, -0.12943749}, + { 0.01736321, -0.04725154, 1.03004253}, + }}; + + pl_matrix3x3_mul(&dovi_lms2rgb, &repr->dovi->linear); + ident_t mat = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_mat3("lms2rgb"), + .data = PL_TRANSPOSE_3X3(dovi_lms2rgb.m), + }); + + // PQ EOTF + GLSL("color.rgb = pow(max(color.rgb, 0.0), vec3(1.0/%f)); \n" + "color.rgb = max(color.rgb - vec3(%f), 0.0) \n" + " / (vec3(%f) - vec3(%f) * color.rgb); \n" + "color.rgb = pow(color.rgb, vec3(1.0/%f)); \n", + PQ_M2, PQ_C1, PQ_C2, PQ_C3, PQ_M1); + // LMS matrix + GLSL("color.rgb = "$" * color.rgb; \n", mat); + // PQ OETF + GLSL("color.rgb = pow(max(color.rgb, 0.0), vec3(%f)); \n" + "color.rgb = (vec3(%f) + vec3(%f) * color.rgb) \n" + " / (vec3(1.0) + vec3(%f) * color.rgb); \n" + "color.rgb = pow(color.rgb, vec3(%f)); \n", + PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2); + break; +#else + SH_FAIL(sh, "libplacebo was compiled without support for dolbyvision reshaping"); + return; +#endif + + case PL_COLOR_SYSTEM_UNKNOWN: + case PL_COLOR_SYSTEM_RGB: + case PL_COLOR_SYSTEM_XYZ: + case PL_COLOR_SYSTEM_BT_601: + case PL_COLOR_SYSTEM_BT_709: + case PL_COLOR_SYSTEM_SMPTE_240M: + case PL_COLOR_SYSTEM_BT_2020_NC: + case PL_COLOR_SYSTEM_YCGCO: + break; // no special post-processing needed + + case PL_COLOR_SYSTEM_COUNT: + pl_unreachable(); + } + + // Gamma adjustment. Doing this here (in non-linear light) is technically + // somewhat wrong, but this is just an aesthetic parameter and not really + // meant for colorimetric precision, so we don't care too much. 
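+ // Worked example (illustrative, not from the original source): the branch below + // applies pow(x, 1.0/gamma), so params->gamma = 1.2 maps a mid-grey of 0.5 to + // pow(0.5, 1.0/1.2) ~= 0.56 (slightly brighter), while gamma = 0.8 yields + // pow(0.5, 1.25) ~= 0.42 (darker).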
+ if (params && params->gamma == 0) { + // Avoid division by zero + GLSL("color.rgb = vec3(0.0); \n"); + } else if (params && params->gamma != 1) { + ident_t gamma = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_float("gamma"), + .data = &(float){ 1 / params->gamma }, + }); + GLSL("color.rgb = pow(max(color.rgb, vec3(0.0)), vec3("$")); \n", gamma); + } + + GLSL("}\n"); +} + +void pl_shader_encode_color(pl_shader sh, const struct pl_color_repr *repr) +{ + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return; + + sh_describe(sh, "color encoding"); + GLSL("// pl_shader_encode_color \n" + "{ \n"); + + switch (repr->sys) { + case PL_COLOR_SYSTEM_BT_2020_C: + // Expand R'G'B' to RGB + GLSL("vec3 lin = mix(color.rgb * vec3(1.0/4.5), \n" + " pow((color.rgb + vec3(0.0993))*vec3(1.0/1.0993), \n" + " vec3(1.0/0.45)), \n" + " lessThanEqual(vec3(0.08145), color.rgb)); \n"); + + // Compute Yc from RGB and compress to R'Y'cB' + GLSL("color.g = dot(vec3(0.2627, 0.6780, 0.0593), lin); \n" + "color.g = mix(color.g * 4.5, \n" + " 1.0993 * pow(color.g, 0.45) - 0.0993, \n" + " 0.0181 <= color.g); \n"); + + // Compute C'bc and C'rc into color.br + GLSL("color.br = color.br - color.gg; \n" + "color.br *= mix(vec2(1.0/1.5816, 1.0/0.9936), \n" + " vec2(1.0/1.9404, 1.0/1.7184), \n" + " lessThanEqual(color.br, vec2(0.0))); \n"); + break; + + case PL_COLOR_SYSTEM_BT_2100_PQ:; + GLSL("color.rgb = pow(max(color.rgb, 0.0), vec3(1.0/%f)); \n" + "color.rgb = max(color.rgb - vec3(%f), 0.0) \n" + " / (vec3(%f) - vec3(%f) * color.rgb); \n" + "color.rgb = pow(color.rgb, vec3(1.0/%f)); \n" + "color.rgb = mat3(0.412109, 0.166748, 0.024170, \n" + " 0.523925, 0.720459, 0.075440, \n" + " 0.063965, 0.112793, 0.900394) * color.rgb; \n" + "color.rgb = pow(color.rgb, vec3(%f)); \n" + "color.rgb = (vec3(%f) + vec3(%f) * color.rgb) \n" + " / (vec3(1.0) + vec3(%f) * color.rgb); \n" + "color.rgb = pow(color.rgb, vec3(%f)); \n", + PQ_M2, PQ_C1, PQ_C2, PQ_C3, PQ_M1, + PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2); + break; + + case PL_COLOR_SYSTEM_BT_2100_HLG: + GLSL("color.rgb = mix(vec3(4.0) * color.rgb * color.rgb, \n" + " exp((color.rgb - vec3(%f)) * vec3(1.0/%f)) \n" + " + vec3(%f), \n" + " lessThan(vec3(0.5), color.rgb)); \n" + "color.rgb = mat3(0.412109, 0.166748, 0.024170, \n" + " 0.523925, 0.720459, 0.075440, \n" + " 0.063965, 0.112793, 0.900394) * color.rgb; \n" + "color.rgb = mix(vec3(0.5) * sqrt(color.rgb), \n" + " vec3(%f) * log(color.rgb - vec3(%f)) + vec3(%f), \n" + " lessThan(vec3(1.0), color.rgb)); \n", + HLG_C, HLG_A, HLG_B, + HLG_A, HLG_B, HLG_C); + break; + + case PL_COLOR_SYSTEM_DOLBYVISION: + SH_FAIL(sh, "Cannot un-apply dolbyvision yet (no inverse reshaping)!"); + return; + + case PL_COLOR_SYSTEM_UNKNOWN: + case PL_COLOR_SYSTEM_RGB: + case PL_COLOR_SYSTEM_XYZ: + case PL_COLOR_SYSTEM_BT_601: + case PL_COLOR_SYSTEM_BT_709: + case PL_COLOR_SYSTEM_SMPTE_240M: + case PL_COLOR_SYSTEM_BT_2020_NC: + case PL_COLOR_SYSTEM_YCGCO: + break; // no special pre-processing needed + + case PL_COLOR_SYSTEM_COUNT: + pl_unreachable(); + } + + // Since this is a relatively rare operation, bypass it as much as possible + bool skip = true; + skip &= PL_DEF(repr->sys, PL_COLOR_SYSTEM_RGB) == PL_COLOR_SYSTEM_RGB; + skip &= PL_DEF(repr->levels, PL_COLOR_LEVELS_FULL) == PL_COLOR_LEVELS_FULL; + skip &= !repr->bits.sample_depth || !repr->bits.color_depth || + repr->bits.sample_depth == repr->bits.color_depth; + skip &= !repr->bits.bit_shift; + + if (!skip) { + struct pl_color_repr copy = *repr; + ident_t xyzscale = NULL_IDENT; + if (repr->sys == 
PL_COLOR_SYSTEM_XYZ) + xyzscale = SH_FLOAT(1.0 / pl_color_repr_normalize(&copy)); + + pl_transform3x3 tr = pl_color_repr_decode(&copy, NULL); + pl_transform3x3_invert(&tr); + + ident_t cmat = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_mat3("cmat"), + .data = PL_TRANSPOSE_3X3(tr.mat.m), + }); + + ident_t cmat_c = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_vec3("cmat_c"), + .data = tr.c, + }); + + GLSL("color.rgb = "$" * color.rgb + "$"; \n", cmat, cmat_c); + + if (repr->sys == PL_COLOR_SYSTEM_XYZ) { + pl_shader_delinearize(sh, &(struct pl_color_space) { + .transfer = PL_COLOR_TRC_ST428, + }); + GLSL("color.rgb *= vec3("$"); \n", xyzscale); + } + } + + if (repr->alpha == PL_ALPHA_PREMULTIPLIED) + GLSL("color.rgb *= vec3(color.a); \n"); + + GLSL("}\n"); +} + +static ident_t sh_luma_coeffs(pl_shader sh, const struct pl_color_space *csp) +{ + pl_matrix3x3 rgb2xyz; + rgb2xyz = pl_get_rgb2xyz_matrix(pl_raw_primaries_get(csp->primaries)); + + // FIXME: Cannot use `const vec3` due to glslang bug #2025 + ident_t coeffs = sh_fresh(sh, "luma_coeffs"); + GLSLH("#define "$" vec3("$", "$", "$") \n", coeffs, + SH_FLOAT(rgb2xyz.m[1][0]), // RGB->Y vector + SH_FLOAT(rgb2xyz.m[1][1]), + SH_FLOAT(rgb2xyz.m[1][2])); + return coeffs; +} + +void pl_shader_linearize(pl_shader sh, const struct pl_color_space *csp) +{ + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return; + + if (csp->transfer == PL_COLOR_TRC_LINEAR) + return; + + float csp_min, csp_max; + pl_color_space_nominal_luma_ex(pl_nominal_luma_params( + .color = csp, + .metadata = PL_HDR_METADATA_HDR10, + .scaling = PL_HDR_NORM, + .out_min = &csp_min, + .out_max = &csp_max, + )); + + // Note that this clamp may technically violate the definition of + // ITU-R BT.2100, which allows for sub-blacks and super-whites to be + // displayed on the display where such would be possible. That said, the + // problem is that not all gamma curves are well-defined on the values + // outside this range, so we ignore it and just clamp anyway for sanity. 
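+ // Illustrative sanity check for the BT.1886 case below: an SDR range of + // csp_min = 0.0 and csp_max = 1.0 gives lb = 0, lw = 1, hence a = 1 and b = 0, + // so the expression reduces to a plain pow(color.rgb, 2.4); with a raised black + // level (csp_min > 0), b > 0 and an all-zero input maps to csp_min instead of 0.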
+ GLSL("// pl_shader_linearize \n" + "color.rgb = max(color.rgb, 0.0); \n"); + + switch (csp->transfer) { + case PL_COLOR_TRC_SRGB: + GLSL("color.rgb = mix(color.rgb * vec3(1.0/12.92), \n" + " pow((color.rgb + vec3(0.055))/vec3(1.055), \n" + " vec3(2.4)), \n" + " lessThan(vec3(0.04045), color.rgb)); \n"); + goto scale_out; + case PL_COLOR_TRC_BT_1886: { + const float lb = powf(csp_min, 1/2.4f); + const float lw = powf(csp_max, 1/2.4f); + const float a = powf(lw - lb, 2.4f); + const float b = lb / (lw - lb); + GLSL("color.rgb = "$" * pow(color.rgb + vec3("$"), vec3(2.4)); \n", + SH_FLOAT(a), SH_FLOAT(b)); + return; + } + case PL_COLOR_TRC_GAMMA18: + GLSL("color.rgb = pow(color.rgb, vec3(1.8));\n"); + goto scale_out; + case PL_COLOR_TRC_GAMMA20: + GLSL("color.rgb = pow(color.rgb, vec3(2.0));\n"); + goto scale_out; + case PL_COLOR_TRC_UNKNOWN: + case PL_COLOR_TRC_GAMMA22: + GLSL("color.rgb = pow(color.rgb, vec3(2.2));\n"); + goto scale_out; + case PL_COLOR_TRC_GAMMA24: + GLSL("color.rgb = pow(color.rgb, vec3(2.4));\n"); + goto scale_out; + case PL_COLOR_TRC_GAMMA26: + GLSL("color.rgb = pow(color.rgb, vec3(2.6));\n"); + goto scale_out; + case PL_COLOR_TRC_GAMMA28: + GLSL("color.rgb = pow(color.rgb, vec3(2.8));\n"); + goto scale_out; + case PL_COLOR_TRC_PRO_PHOTO: + GLSL("color.rgb = mix(color.rgb * vec3(1.0/16.0), \n" + " pow(color.rgb, vec3(1.8)), \n" + " lessThan(vec3(0.03125), color.rgb)); \n"); + goto scale_out; + case PL_COLOR_TRC_ST428: + GLSL("color.rgb = vec3(52.37/48.0) * pow(color.rgb, vec3(2.6));\n"); + goto scale_out; + case PL_COLOR_TRC_PQ: + GLSL("color.rgb = pow(color.rgb, vec3(1.0/%f)); \n" + "color.rgb = max(color.rgb - vec3(%f), 0.0) \n" + " / (vec3(%f) - vec3(%f) * color.rgb); \n" + "color.rgb = pow(color.rgb, vec3(1.0/%f)); \n" + // PQ's output range is 0-10000, but we need it to be relative to + // to PL_COLOR_SDR_WHITE instead, so rescale + "color.rgb *= vec3(%f); \n", + PQ_M2, PQ_C1, PQ_C2, PQ_C3, PQ_M1, 10000.0 / PL_COLOR_SDR_WHITE); + return; + case PL_COLOR_TRC_HLG: { + const float y = fmaxf(1.2f + 0.42f * log10f(csp_max / HLG_REF), 1); + const float b = sqrtf(3 * powf(csp_min / csp_max, 1 / y)); + // OETF^-1 + GLSL("color.rgb = "$" * color.rgb + vec3("$"); \n" + "color.rgb = mix(vec3(4.0) * color.rgb * color.rgb, \n" + " exp((color.rgb - vec3(%f)) * vec3(1.0/%f))\n" + " + vec3(%f), \n" + " lessThan(vec3(0.5), color.rgb)); \n", + SH_FLOAT(1 - b), SH_FLOAT(b), + HLG_C, HLG_A, HLG_B); + // OOTF + GLSL("color.rgb *= 1.0 / 12.0; \n" + "color.rgb *= "$" * pow(max(dot("$", color.rgb), 0.0), "$"); \n", + SH_FLOAT(csp_max), sh_luma_coeffs(sh, csp), SH_FLOAT(y - 1)); + return; + } + case PL_COLOR_TRC_V_LOG: + GLSL("color.rgb = mix((color.rgb - vec3(0.125)) * vec3(1.0/5.6), \n" + " pow(vec3(10.0), (color.rgb - vec3(%f)) * vec3(1.0/%f)) \n" + " - vec3(%f), \n" + " lessThanEqual(vec3(0.181), color.rgb)); \n", + VLOG_D, VLOG_C, VLOG_B); + return; + case PL_COLOR_TRC_S_LOG1: + GLSL("color.rgb = pow(vec3(10.0), (color.rgb - vec3(%f)) * vec3(1.0/%f)) \n" + " - vec3(%f); \n", + SLOG_C, SLOG_A, SLOG_B); + return; + case PL_COLOR_TRC_S_LOG2: + GLSL("color.rgb = mix((color.rgb - vec3(%f)) * vec3(1.0/%f), \n" + " (pow(vec3(10.0), (color.rgb - vec3(%f)) * vec3(1.0/%f)) \n" + " - vec3(%f)) * vec3(1.0/%f), \n" + " lessThanEqual(vec3(%f), color.rgb)); \n", + SLOG_Q, SLOG_P, SLOG_C, SLOG_A, SLOG_B, SLOG_K2, SLOG_Q); + return; + case PL_COLOR_TRC_LINEAR: + case PL_COLOR_TRC_COUNT: + break; + } + + pl_unreachable(); + +scale_out: + if (csp_max != 1 || csp_min != 0) { + GLSL("color.rgb = "$" * 
color.rgb + vec3("$"); \n", + SH_FLOAT(csp_max - csp_min), SH_FLOAT(csp_min)); + } +} + +void pl_shader_delinearize(pl_shader sh, const struct pl_color_space *csp) +{ + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return; + + if (csp->transfer == PL_COLOR_TRC_LINEAR) + return; + + float csp_min, csp_max; + pl_color_space_nominal_luma_ex(pl_nominal_luma_params( + .color = csp, + .metadata = PL_HDR_METADATA_HDR10, + .scaling = PL_HDR_NORM, + .out_min = &csp_min, + .out_max = &csp_max, + )); + + GLSL("// pl_shader_delinearize \n"); + switch (csp->transfer) { + case PL_COLOR_TRC_UNKNOWN: + case PL_COLOR_TRC_SRGB: + case PL_COLOR_TRC_LINEAR: + case PL_COLOR_TRC_GAMMA18: + case PL_COLOR_TRC_GAMMA20: + case PL_COLOR_TRC_GAMMA22: + case PL_COLOR_TRC_GAMMA24: + case PL_COLOR_TRC_GAMMA26: + case PL_COLOR_TRC_GAMMA28: + case PL_COLOR_TRC_PRO_PHOTO: + case PL_COLOR_TRC_ST428: ; + if (csp_max != 1 || csp_min != 0) { + GLSL("color.rgb = "$" * color.rgb + vec3("$"); \n", + SH_FLOAT(1 / (csp_max - csp_min)), + SH_FLOAT(-csp_min / (csp_max - csp_min))); + } + break; + case PL_COLOR_TRC_BT_1886: + case PL_COLOR_TRC_PQ: + case PL_COLOR_TRC_HLG: + case PL_COLOR_TRC_V_LOG: + case PL_COLOR_TRC_S_LOG1: + case PL_COLOR_TRC_S_LOG2: + break; // scene-referred or absolute scale + case PL_COLOR_TRC_COUNT: + pl_unreachable(); + } + + GLSL("color.rgb = max(color.rgb, 0.0); \n"); + + switch (csp->transfer) { + case PL_COLOR_TRC_SRGB: + GLSL("color.rgb = mix(color.rgb * vec3(12.92), \n" + " vec3(1.055) * pow(color.rgb, vec3(1.0/2.4)) \n" + " - vec3(0.055), \n" + " lessThanEqual(vec3(0.0031308), color.rgb)); \n"); + return; + case PL_COLOR_TRC_BT_1886: { + const float lb = powf(csp_min, 1/2.4f); + const float lw = powf(csp_max, 1/2.4f); + const float a = powf(lw - lb, 2.4f); + const float b = lb / (lw - lb); + GLSL("color.rgb = pow("$" * color.rgb, vec3(1.0/2.4)) - vec3("$"); \n", + SH_FLOAT(1.0 / a), SH_FLOAT(b)); + return; + } + case PL_COLOR_TRC_GAMMA18: + GLSL("color.rgb = pow(color.rgb, vec3(1.0/1.8));\n"); + return; + case PL_COLOR_TRC_GAMMA20: + GLSL("color.rgb = pow(color.rgb, vec3(1.0/2.0));\n"); + return; + case PL_COLOR_TRC_UNKNOWN: + case PL_COLOR_TRC_GAMMA22: + GLSL("color.rgb = pow(color.rgb, vec3(1.0/2.2));\n"); + return; + case PL_COLOR_TRC_GAMMA24: + GLSL("color.rgb = pow(color.rgb, vec3(1.0/2.4));\n"); + return; + case PL_COLOR_TRC_GAMMA26: + GLSL("color.rgb = pow(color.rgb, vec3(1.0/2.6));\n"); + return; + case PL_COLOR_TRC_GAMMA28: + GLSL("color.rgb = pow(color.rgb, vec3(1.0/2.8));\n"); + return; + case PL_COLOR_TRC_ST428: + GLSL("color.rgb = pow(color.rgb * vec3(48.0/52.37), vec3(1.0/2.6));\n"); + return; + case PL_COLOR_TRC_PRO_PHOTO: + GLSL("color.rgb = mix(color.rgb * vec3(16.0), \n" + " pow(color.rgb, vec3(1.0/1.8)), \n" + " lessThanEqual(vec3(0.001953), color.rgb)); \n"); + return; + case PL_COLOR_TRC_PQ: + GLSL("color.rgb *= vec3(1.0/%f); \n" + "color.rgb = pow(color.rgb, vec3(%f)); \n" + "color.rgb = (vec3(%f) + vec3(%f) * color.rgb) \n" + " / (vec3(1.0) + vec3(%f) * color.rgb); \n" + "color.rgb = pow(color.rgb, vec3(%f)); \n", + 10000 / PL_COLOR_SDR_WHITE, PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2); + return; + case PL_COLOR_TRC_HLG: { + const float y = fmaxf(1.2f + 0.42f * log10f(csp_max / HLG_REF), 1); + const float b = sqrtf(3 * powf(csp_min / csp_max, 1 / y)); + // OOTF^-1 + GLSL("color.rgb *= 1.0 / "$"; \n" + "color.rgb *= 12.0 * max(1e-6, pow(dot("$", color.rgb), "$")); \n", + SH_FLOAT(csp_max), sh_luma_coeffs(sh, csp), SH_FLOAT((1 - y) / y)); + // OETF + GLSL("color.rgb = mix(vec3(0.5) * 
sqrt(color.rgb), \n" + " vec3(%f) * log(color.rgb - vec3(%f)) + vec3(%f), \n" + " lessThan(vec3(1.0), color.rgb)); \n" + "color.rgb = "$" * color.rgb + vec3("$"); \n", + HLG_A, HLG_B, HLG_C, + SH_FLOAT(1 / (1 - b)), SH_FLOAT(-b / (1 - b))); + return; + } + case PL_COLOR_TRC_V_LOG: + GLSL("color.rgb = mix(vec3(5.6) * color.rgb + vec3(0.125), \n" + " vec3(%f) * log(color.rgb + vec3(%f)) \n" + " + vec3(%f), \n" + " lessThanEqual(vec3(0.01), color.rgb)); \n", + VLOG_C / M_LN10, VLOG_B, VLOG_D); + return; + case PL_COLOR_TRC_S_LOG1: + GLSL("color.rgb = vec3(%f) * log(color.rgb + vec3(%f)) + vec3(%f);\n", + SLOG_A / M_LN10, SLOG_B, SLOG_C); + return; + case PL_COLOR_TRC_S_LOG2: + GLSL("color.rgb = mix(vec3(%f) * color.rgb + vec3(%f), \n" + " vec3(%f) * log(vec3(%f) * color.rgb + vec3(%f)) \n" + " + vec3(%f), \n" + " lessThanEqual(vec3(0.0), color.rgb)); \n", + SLOG_P, SLOG_Q, SLOG_A / M_LN10, SLOG_K2, SLOG_B, SLOG_C); + return; + case PL_COLOR_TRC_LINEAR: + case PL_COLOR_TRC_COUNT: + break; + } + + pl_unreachable(); +} + +const struct pl_sigmoid_params pl_sigmoid_default_params = { PL_SIGMOID_DEFAULTS }; + +void pl_shader_sigmoidize(pl_shader sh, const struct pl_sigmoid_params *params) +{ + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return; + + params = PL_DEF(params, &pl_sigmoid_default_params); + float center = PL_DEF(params->center, pl_sigmoid_default_params.center); + float slope = PL_DEF(params->slope, pl_sigmoid_default_params.slope); + + // This function needs to go through (0,0) and (1,1), so we compute the + // values at 1 and 0, and then scale/shift them, respectively. + float offset = 1.0 / (1 + expf(slope * center)); + float scale = 1.0 / (1 + expf(slope * (center - 1))) - offset; + + GLSL("// pl_shader_sigmoidize \n" + "color = clamp(color, 0.0, 1.0); \n" + "color = vec4("$") - vec4("$") * \n" + " log(vec4(1.0) / (color * vec4("$") + vec4("$")) \n" + " - vec4(1.0)); \n", + SH_FLOAT(center), SH_FLOAT(1.0 / slope), + SH_FLOAT(scale), SH_FLOAT(offset)); +} + +void pl_shader_unsigmoidize(pl_shader sh, const struct pl_sigmoid_params *params) +{ + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return; + + // See: pl_shader_sigmoidize + params = PL_DEF(params, &pl_sigmoid_default_params); + float center = PL_DEF(params->center, pl_sigmoid_default_params.center); + float slope = PL_DEF(params->slope, pl_sigmoid_default_params.slope); + float offset = 1.0 / (1 + expf(slope * center)); + float scale = 1.0 / (1 + expf(slope * (center - 1))) - offset; + + GLSL("// pl_shader_unsigmoidize \n" + "color = clamp(color, 0.0, 1.0); \n" + "color = vec4("$") / \n" + " (vec4(1.0) + exp(vec4("$") * (vec4("$") - color))) \n" + " - vec4("$"); \n", + SH_FLOAT(1.0 / scale), + SH_FLOAT(slope), SH_FLOAT(center), + SH_FLOAT(offset / scale)); +} + +const struct pl_peak_detect_params pl_peak_detect_default_params = { PL_PEAK_DETECT_DEFAULTS }; +const struct pl_peak_detect_params pl_peak_detect_high_quality_params = { PL_PEAK_DETECT_HQ_DEFAULTS }; + +static bool peak_detect_params_eq(const struct pl_peak_detect_params *a, + const struct pl_peak_detect_params *b) +{ + return a->smoothing_period == b->smoothing_period && + a->scene_threshold_low == b->scene_threshold_low && + a->scene_threshold_high == b->scene_threshold_high && + a->percentile == b->percentile; + // don't compare `allow_delayed` because it doesn't change measurement +} + +enum { + // Split the peak buffer into several independent slices to reduce pressure + // on global atomics + SLICES = 12, + + // How many bits to use for storing PQ 
data. Be careful when setting this + // too high, as it may overflow `unsigned int` on large video sources. + // + // The value chosen is enough to guarantee no overflow for an 8K x 4K frame + // consisting entirely of 100% 10k nits PQ values, with 16x16 workgroups. + PQ_BITS = 14, + PQ_MAX = (1 << PQ_BITS) - 1, + + // How many bits to use for the histogram. We bias the histogram down + // by half the PQ range (~90 nits), effectively clumping the SDR part + // of the image into a single histogram bin. + HIST_BITS = 7, + HIST_BIAS = 1 << (HIST_BITS - 1), + HIST_BINS = (1 << HIST_BITS) - HIST_BIAS, + + // Convert from histogram bin to (starting) PQ value +#define HIST_PQ(bin) (((bin) + HIST_BIAS) << (PQ_BITS - HIST_BITS)) +}; + + +pl_static_assert(PQ_BITS >= HIST_BITS); + +struct peak_buf_data { + unsigned frame_wg_count[SLICES]; // number of work groups processed + unsigned frame_wg_active[SLICES];// number of active (nonzero) work groups + unsigned frame_sum_pq[SLICES]; // sum of PQ Y values over all WGs (PQ_BITS) + unsigned frame_max_pq[SLICES]; // maximum PQ Y value among these WGs (PQ_BITS) + unsigned frame_hist[SLICES][HIST_BINS]; // always allocated, conditionally used +}; + +static const struct pl_buffer_var peak_buf_vars[] = { +#define VAR(field) { \ + .var = { \ + .name = #field, \ + .type = PL_VAR_UINT, \ + .dim_v = 1, \ + .dim_m = 1, \ + .dim_a = sizeof(((struct peak_buf_data *) NULL)->field) / \ + sizeof(unsigned), \ + }, \ + .layout = { \ + .offset = offsetof(struct peak_buf_data, field), \ + .size = sizeof(((struct peak_buf_data *) NULL)->field), \ + .stride = sizeof(unsigned), \ + }, \ +} + VAR(frame_wg_count), + VAR(frame_wg_active), + VAR(frame_sum_pq), + VAR(frame_max_pq), + VAR(frame_hist), +#undef VAR +}; + +struct sh_color_map_obj { + // Tone map state + struct { + struct pl_tone_map_params params; + pl_shader_obj lut; + } tone; + + // Gamut map state + struct { + pl_shader_obj lut; + } gamut; + + // Peak detection state + struct { + struct pl_peak_detect_params params; // currently active parameters + pl_buf buf; // pending peak detection buffer + pl_buf readback; // readback buffer (fallback) + float avg_pq; // current (smoothed) values + float max_pq; + } peak; +}; + +// Excluding size, since this is checked by sh_lut +static uint64_t gamut_map_signature(const struct pl_gamut_map_params *par) +{ + uint64_t sig = CACHE_KEY_GAMUT_LUT; + pl_hash_merge(&sig, pl_str0_hash(par->function->name)); + pl_hash_merge(&sig, pl_var_hash(par->input_gamut)); + pl_hash_merge(&sig, pl_var_hash(par->output_gamut)); + pl_hash_merge(&sig, pl_var_hash(par->min_luma)); + pl_hash_merge(&sig, pl_var_hash(par->max_luma)); + pl_hash_merge(&sig, pl_var_hash(par->constants)); + return sig; +} + +static void sh_color_map_uninit(pl_gpu gpu, void *ptr) +{ + struct sh_color_map_obj *obj = ptr; + pl_shader_obj_destroy(&obj->tone.lut); + pl_shader_obj_destroy(&obj->gamut.lut); + pl_buf_destroy(gpu, &obj->peak.buf); + pl_buf_destroy(gpu, &obj->peak.readback); + memset(obj, 0, sizeof(*obj)); +} + +static inline float iir_coeff(float rate) +{ + if (!rate) + return 1.0f; + return 1.0f - expf(-1.0f / rate); +} + +static float measure_peak(const struct peak_buf_data *data, float percentile) +{ + unsigned frame_max_pq = data->frame_max_pq[0]; + for (int k = 1; k < SLICES; k++) + frame_max_pq = PL_MAX(frame_max_pq, data->frame_max_pq[k]); + const float frame_max = (float) frame_max_pq / PQ_MAX; + if (percentile <= 0 || percentile >= 100) + return frame_max; + unsigned total_pixels = 0; + for (int k = 0; k < 
SLICES; k++) { + for (int i = 0; i < HIST_BINS; i++) + total_pixels += data->frame_hist[k][i]; + } + if (!total_pixels) // no histogram data available? + return frame_max; + + const unsigned target_pixel = ceilf(percentile / 100.0f * total_pixels); + if (target_pixel >= total_pixels) + return frame_max; + + unsigned sum = 0; + for (int i = 0; i < HIST_BINS; i++) { + unsigned next = sum; + for (int k = 0; k < SLICES; k++) + next += data->frame_hist[k][i]; + if (next < target_pixel) { + sum = next; + continue; + } + + // Upper and lower frequency boundaries of the matching histogram bin + const unsigned count_low = sum; // last pixel of previous bin + const unsigned count_high = next + 1; // first pixel of next bin + pl_assert(count_low < target_pixel && target_pixel < count_high); + + // PQ luminance associated with count_low/high respectively + const float pq_low = (float) HIST_PQ(i) / PQ_MAX; + float pq_high = (float) HIST_PQ(i + 1) / PQ_MAX; + if (count_high > total_pixels) // special case for last histogram bin + pq_high = frame_max; + + // Position of `target_pixel` inside this bin, assumes pixels are + // equidistributed inside a histogram bin + const float ratio = (float) (target_pixel - count_low) / + (count_high - count_low); + return PL_MIX(pq_low, pq_high, ratio); + } + + pl_unreachable(); +} + +// if `force` is true, ensures the buffer is read, even if `allow_delayed` +static void update_peak_buf(pl_gpu gpu, struct sh_color_map_obj *obj, bool force) +{ + const struct pl_peak_detect_params *params = &obj->peak.params; + if (!obj->peak.buf) + return; + + if (!force && params->allow_delayed && pl_buf_poll(gpu, obj->peak.buf, 0)) + return; // buffer not ready yet + + bool ok; + struct peak_buf_data data = {0}; + if (obj->peak.readback) { + pl_buf_copy(gpu, obj->peak.readback, 0, obj->peak.buf, 0, sizeof(data)); + ok = pl_buf_read(gpu, obj->peak.readback, 0, &data, sizeof(data)); + } else { + ok = pl_buf_read(gpu, obj->peak.buf, 0, &data, sizeof(data)); + } + if (ok && data.frame_wg_count[0] > 0) { + // Peak detection completed successfully + pl_buf_destroy(gpu, &obj->peak.buf); + } else { + // No data read? Possibly this peak obj has not been executed yet + if (!ok) { + PL_ERR(gpu, "Failed reading peak detection buffer!"); + } else if (params->allow_delayed) { + PL_TRACE(gpu, "Peak detection buffer not yet ready, ignoring.."); + } else { + PL_WARN(gpu, "Peak detection usage error: attempted detecting peak " + "and using detected peak in the same shader program, " + "but `params->allow_delayed` is false! 
Ignoring, but " + "expect incorrect output."); + } + if (force || !ok) + pl_buf_destroy(gpu, &obj->peak.buf); + return; + } + + uint64_t frame_sum_pq = 0u, frame_wg_count = 0u, frame_wg_active = 0u; + for (int k = 0; k < SLICES; k++) { + frame_sum_pq += data.frame_sum_pq[k]; + frame_wg_count += data.frame_wg_count[k]; + frame_wg_active += data.frame_wg_active[k]; + } + float avg_pq, max_pq; + if (frame_wg_active) { + avg_pq = (float) frame_sum_pq / (frame_wg_active * PQ_MAX); + max_pq = measure_peak(&data, params->percentile); + } else { + // Solid black frame + avg_pq = max_pq = PL_COLOR_HDR_BLACK; + } + + if (!obj->peak.avg_pq) { + // Set the initial value accordingly if it contains no data + obj->peak.avg_pq = avg_pq; + obj->peak.max_pq = max_pq; + } else { + // Ignore small deviations from existing peak (rounding error) + static const float epsilon = 1.0f / PQ_MAX; + if (fabsf(avg_pq - obj->peak.avg_pq) < epsilon) + avg_pq = obj->peak.avg_pq; + if (fabsf(max_pq - obj->peak.max_pq) < epsilon) + max_pq = obj->peak.max_pq; + } + + // Use an IIR low-pass filter to smooth out the detected values + const float coeff = iir_coeff(params->smoothing_period); + obj->peak.avg_pq += coeff * (avg_pq - obj->peak.avg_pq); + obj->peak.max_pq += coeff * (max_pq - obj->peak.max_pq); + + // Scene change hysteresis + if (params->scene_threshold_low > 0 && params->scene_threshold_high > 0) { + const float log10_pq = 1e-2f; // experimentally determined approximate + const float thresh_low = params->scene_threshold_low * log10_pq; + const float thresh_high = params->scene_threshold_high * log10_pq; + const float bias = (float) frame_wg_active / frame_wg_count; + const float delta = bias * fabsf(avg_pq - obj->peak.avg_pq); + const float mix_coeff = pl_smoothstep(thresh_low, thresh_high, delta); + obj->peak.avg_pq = PL_MIX(obj->peak.avg_pq, avg_pq, mix_coeff); + obj->peak.max_pq = PL_MIX(obj->peak.max_pq, max_pq, mix_coeff); + } +} + +bool pl_shader_detect_peak(pl_shader sh, struct pl_color_space csp, + pl_shader_obj *state, + const struct pl_peak_detect_params *params) +{ + params = PL_DEF(params, &pl_peak_detect_default_params); + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return false; + + pl_gpu gpu = SH_GPU(sh); + if (!gpu || gpu->limits.max_ssbo_size < sizeof(struct peak_buf_data)) { + PL_ERR(sh, "HDR peak detection requires a GPU with support for at " + "least %zu bytes of SSBO data (supported: %zu)", + sizeof(struct peak_buf_data), gpu ? gpu->limits.max_ssbo_size : 0); + return false; + } + + const bool use_histogram = params->percentile > 0 && params->percentile < 100; + size_t shmem_req = 3 * sizeof(uint32_t); + if (use_histogram) + shmem_req += sizeof(uint32_t[HIST_BINS]); + + if (!sh_try_compute(sh, 16, 16, true, shmem_req)) { + PL_ERR(sh, "HDR peak detection requires compute shaders with support " + "for at least %zu bytes of shared memory! 
(avail: %zu)", + shmem_req, sh_glsl(sh).max_shmem_size); + return false; + } + + struct sh_color_map_obj *obj; + obj = SH_OBJ(sh, state, PL_SHADER_OBJ_COLOR_MAP, struct sh_color_map_obj, + sh_color_map_uninit); + if (!obj) + return false; + + if (peak_detect_params_eq(&obj->peak.params, params)) { + update_peak_buf(gpu, obj, true); // prevent over-writing previous frame + } else { + pl_reset_detected_peak(*state); + } + + pl_assert(!obj->peak.buf); + static const struct peak_buf_data zero = {0}; + +retry_ssbo: + if (obj->peak.readback) { + obj->peak.buf = pl_buf_create(gpu, pl_buf_params( + .size = sizeof(struct peak_buf_data), + .storable = true, + .initial_data = &zero, + )); + } else { + obj->peak.buf = pl_buf_create(gpu, pl_buf_params( + .size = sizeof(struct peak_buf_data), + .memory_type = PL_BUF_MEM_DEVICE, + .host_readable = true, + .storable = true, + .initial_data = &zero, + )); + } + + if (!obj->peak.buf && !obj->peak.readback) { + PL_WARN(sh, "Failed creating host-readable peak detection SSBO, " + "retrying with fallback buffer"); + obj->peak.readback = pl_buf_create(gpu, pl_buf_params( + .size = sizeof(struct peak_buf_data), + .host_readable = true, + )); + if (obj->peak.readback) + goto retry_ssbo; + } + + if (!obj->peak.buf) { + SH_FAIL(sh, "Failed creating peak detection SSBO!"); + return false; + } + + obj->peak.params = *params; + + sh_desc(sh, (struct pl_shader_desc) { + .desc = { + .name = "PeakBuf", + .type = PL_DESC_BUF_STORAGE, + .access = PL_DESC_ACCESS_READWRITE, + }, + .binding.object = obj->peak.buf, + .buffer_vars = (struct pl_buffer_var *) peak_buf_vars, + .num_buffer_vars = PL_ARRAY_SIZE(peak_buf_vars), + }); + + sh_describe(sh, "peak detection"); + GLSL("// pl_shader_detect_peak \n" + "{ \n" + "const uint wg_size = gl_WorkGroupSize.x * gl_WorkGroupSize.y; \n" + "uint wg_idx = gl_WorkGroupID.y * gl_NumWorkGroups.x + \n" + " gl_WorkGroupID.x; \n" + "uint slice = wg_idx %% %du; \n" + "vec4 color_orig = color; \n", + SLICES); + + // For performance, we want to do as few atomic operations on global + // memory as possible, so use an atomic in shmem for the work group. 
+ ident_t wg_sum = sh_fresh(sh, "wg_sum"), + wg_max = sh_fresh(sh, "wg_max"), + wg_black = sh_fresh(sh, "wg_black"), + wg_hist = NULL_IDENT; + GLSLH("shared uint "$", "$", "$"; \n", wg_sum, wg_max, wg_black); + if (use_histogram) { + wg_hist = sh_fresh(sh, "wg_hist"); + GLSLH("shared uint "$"[%u]; \n", wg_hist, HIST_BINS); + GLSL("for (uint i = gl_LocalInvocationIndex; i < %du; i += wg_size) \n" + " "$"[i] = 0u; \n", + HIST_BINS, wg_hist); + } + GLSL($" = 0u; "$" = 0u; "$" = 0u; \n" + "barrier(); \n", + wg_sum, wg_max, wg_black); + + // Decode color into linear light representation + pl_color_space_infer(&csp); + pl_shader_linearize(sh, &csp); + + // Measure luminance as N-bit PQ + GLSL("float luma = dot("$", color.rgb); \n" + "luma *= %f; \n" + "luma = pow(clamp(luma, 0.0, 1.0), %f); \n" + "luma = (%f + %f * luma) / (1.0 + %f * luma); \n" + "luma = pow(luma, %f); \n" + "luma *= smoothstep(0.0, 1e-2, luma); \n" + "uint y_pq = uint(%d.0 * luma); \n", + sh_luma_coeffs(sh, &csp), + PL_COLOR_SDR_WHITE / 10000.0, + PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2, + PQ_MAX); + + // Update the work group's shared atomics + bool has_subgroups = sh_glsl(sh).subgroup_size > 0; + if (use_histogram) { + GLSL("int bin = (int(y_pq) >> %d) - %d; \n" + "bin = clamp(bin, 0, %d); \n", + PQ_BITS - HIST_BITS, HIST_BIAS, + HIST_BINS - 1); + if (has_subgroups) { + // Optimize for the very common case of identical histogram bins + GLSL("if (subgroupAllEqual(bin)) { \n" + " if (subgroupElect()) \n" + " atomicAdd("$"[bin], gl_SubgroupSize); \n" + "} else { \n" + " atomicAdd("$"[bin], 1u); \n" + "} \n", + wg_hist, wg_hist); + } else { + GLSL("atomicAdd("$"[bin], 1u); \n", wg_hist); + } + } + + if (has_subgroups) { + GLSL("uint group_sum = subgroupAdd(y_pq); \n" + "uint group_max = subgroupMax(y_pq); \n" + "uvec4 b = subgroupBallot(y_pq == 0u); \n" + "if (subgroupElect()) { \n" + " atomicAdd("$", group_sum); \n" + " atomicMax("$", group_max); \n" + " atomicAdd("$", subgroupBallotBitCount(b));\n" + "} \n" + "barrier(); \n", + wg_sum, wg_max, wg_black); + } else { + GLSL("atomicAdd("$", y_pq); \n" + "atomicMax("$", y_pq); \n" + "if (y_pq == 0u) \n" + " atomicAdd("$", 1u); \n" + "barrier(); \n", + wg_sum, wg_max, wg_black); + } + + if (use_histogram) { + GLSL("if (gl_LocalInvocationIndex == 0u) \n" + " "$"[0] -= "$"; \n" + "for (uint i = gl_LocalInvocationIndex; i < %du; i += wg_size) \n" + " atomicAdd(frame_hist[slice * %du + i], "$"[i]); \n", + wg_hist, wg_black, + HIST_BINS, + HIST_BINS, wg_hist); + } + + // Have one thread per work group update the global atomics + GLSL("if (gl_LocalInvocationIndex == 0u) { \n" + " uint num = wg_size - "$"; \n" + " atomicAdd(frame_wg_count[slice], 1u); \n" + " atomicAdd(frame_wg_active[slice], min(num, 1u)); \n" + " if (num > 0u) { \n" + " atomicAdd(frame_sum_pq[slice], "$" / num); \n" + " atomicMax(frame_max_pq[slice], "$"); \n" + " } \n" + "} \n" + "color = color_orig; \n" + "} \n", + wg_black, wg_sum, wg_max); + + return true; +} + +bool pl_get_detected_hdr_metadata(const pl_shader_obj state, + struct pl_hdr_metadata *out) +{ + if (!state || state->type != PL_SHADER_OBJ_COLOR_MAP) + return false; + + struct sh_color_map_obj *obj = state->priv; + update_peak_buf(state->gpu, obj, false); + if (!obj->peak.avg_pq) + return false; + + out->max_pq_y = obj->peak.max_pq; + out->avg_pq_y = obj->peak.avg_pq; + return true; +} + +bool pl_get_detected_peak(const pl_shader_obj state, + float *out_peak, float *out_avg) +{ + struct pl_hdr_metadata data; + if (!pl_get_detected_hdr_metadata(state, &data)) + 
return false; + + // Preserves old behavior + *out_peak = pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, data.max_pq_y); + *out_avg = pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, data.avg_pq_y); + return true; +} + +void pl_reset_detected_peak(pl_shader_obj state) +{ + if (!state || state->type != PL_SHADER_OBJ_COLOR_MAP) + return; + + struct sh_color_map_obj *obj = state->priv; + pl_buf readback = obj->peak.readback; + pl_buf_destroy(state->gpu, &obj->peak.buf); + memset(&obj->peak, 0, sizeof(obj->peak)); + obj->peak.readback = readback; +} + +void pl_shader_extract_features(pl_shader sh, struct pl_color_space csp) +{ + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return; + + sh_describe(sh, "feature extraction"); + pl_shader_linearize(sh, &csp); + GLSL("// pl_shader_extract_features \n" + "{ \n" + "vec3 lms = %f * "$" * color.rgb; \n" + "lms = pow(max(lms, 0.0), vec3(%f)); \n" + "lms = (vec3(%f) + %f * lms) \n" + " / (vec3(1.0) + %f * lms); \n" + "lms = pow(lms, vec3(%f)); \n" + "float I = dot(vec3(%f, %f, %f), lms); \n" + "color = vec4(I, 0.0, 0.0, 1.0); \n" + "} \n", + PL_COLOR_SDR_WHITE / 10000, + SH_MAT3(pl_ipt_rgb2lms(pl_raw_primaries_get(csp.primaries))), + PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2, + pl_ipt_lms2ipt.m[0][0], pl_ipt_lms2ipt.m[0][1], pl_ipt_lms2ipt.m[0][2]); +} + +const struct pl_color_map_params pl_color_map_default_params = { PL_COLOR_MAP_DEFAULTS }; +const struct pl_color_map_params pl_color_map_high_quality_params = { PL_COLOR_MAP_HQ_DEFAULTS }; + +static ident_t rect_pos(pl_shader sh, pl_rect2df rc) +{ + if (!rc.x0 && !rc.x1) + rc.x1 = 1.0f; + if (!rc.y0 && !rc.y1) + rc.y1 = 1.0f; + + return sh_attr_vec2(sh, "tone_map_coords", &(pl_rect2df) { + .x0 = -rc.x0 / (rc.x1 - rc.x0), + .x1 = (1.0f - rc.x0) / (rc.x1 - rc.x0), + .y0 = -rc.y1 / (rc.y0 - rc.y1), + .y1 = (1.0f - rc.y1) / (rc.y0 - rc.y1), + }); +} + +static void visualize_tone_map(pl_shader sh, pl_rect2df rc, float alpha, + const struct pl_tone_map_params *params) +{ + pl_assert(params->input_scaling == PL_HDR_PQ); + pl_assert(params->output_scaling == PL_HDR_PQ); + + GLSL("// Visualize tone mapping \n" + "{ \n" + "vec2 pos = "$"; \n" + "if (min(pos.x, pos.y) >= 0.0 && \n" // visualizer rect + " max(pos.x, pos.y) <= 1.0) \n" + "{ \n" + "float xmin = "$"; \n" + "float xmax = "$"; \n" + "float xavg = "$"; \n" + "float ymin = "$"; \n" + "float ymax = "$"; \n" + "float alpha = 0.8 * "$"; \n" + "vec3 viz = color.rgb; \n" + "float vv = tone_map(pos.x); \n" + // Color based on region + "if (pos.x < xmin || pos.x > xmax) { \n" // outside source + "} else if (pos.y < ymin || pos.y > ymax) {\n" // outside target + " if (pos.y < xmin || pos.y > xmax) { \n" // and also source + " viz = vec3(0.1, 0.1, 0.5); \n" + " } else { \n" + " viz = vec3(0.2, 0.05, 0.05); \n" // but inside source + " } \n" + "} else { \n" // inside domain + " if (abs(pos.x - pos.y) < 1e-3) { \n" // main diagonal + " viz = vec3(0.2); \n" + " } else if (pos.y < vv) { \n" // inside function + " alpha *= 0.6; \n" + " viz = vec3(0.05); \n" + " if (vv > pos.x && pos.y > pos.x) \n" // output brighter than input + " viz.rg = vec2(0.5, 0.7); \n" + " } else { \n" // outside function + " if (vv < pos.x && pos.y < pos.x) \n" // output darker than input + " viz = vec3(0.0, 0.1, 0.2); \n" + " } \n" + " if (pos.y > xmax) { \n" // inverse tone-mapping region + " vec3 hi = vec3(0.2, 0.5, 0.8); \n" + " viz = mix(viz, hi, 0.5); \n" + " } else if (pos.y < xmin) { \n" // black point region + " viz = mix(viz, vec3(0.0), 0.3); \n" + " } \n" + " if (xavg > 0.0 && abs(pos.x - xavg) < 
1e-3)\n" // source avg brightness + " viz = vec3(0.5); \n" + "} \n" + "color.rgb = mix(color.rgb, viz, alpha); \n" + "} \n" + "} \n", + rect_pos(sh, rc), + SH_FLOAT_DYN(params->input_min), + SH_FLOAT_DYN(params->input_max), + SH_FLOAT_DYN(params->input_avg), + SH_FLOAT(params->output_min), + SH_FLOAT_DYN(params->output_max), + SH_FLOAT_DYN(alpha)); +} + +static void visualize_gamut_map(pl_shader sh, pl_rect2df rc, + ident_t lut, float hue, float theta, + const struct pl_gamut_map_params *params) +{ + ident_t ipt2lms = SH_MAT3(pl_ipt_ipt2lms); + ident_t lms2rgb_src = SH_MAT3(pl_ipt_lms2rgb(&params->input_gamut)); + ident_t lms2rgb_dst = SH_MAT3(pl_ipt_lms2rgb(&params->output_gamut)); + + GLSL("// Visualize gamut mapping \n" + "vec2 pos = "$"; \n" + "float pqmin = "$"; \n" + "float pqmax = "$"; \n" + "float rgbmin = "$"; \n" + "float rgbmax = "$"; \n" + "vec3 orig = ipt; \n" + "if (min(pos.x, pos.y) >= 0.0 && \n" + " max(pos.x, pos.y) <= 1.0) \n" + "{ \n" + // Source color to visualize + "float mid = mix(pqmin, pqmax, 0.6); \n" + "vec3 base = vec3(0.5, 0.0, 0.0); \n" + "float hue = "$", theta = "$"; \n" + "base.x = mix(base.x, mid, sin(theta)); \n" + "mat3 rot1 = mat3(1.0, 0.0, 0.0, \n" + " 0.0, cos(hue), sin(hue), \n" + " 0.0, -sin(hue), cos(hue)); \n" + "mat3 rot2 = mat3( cos(theta), 0.0, sin(theta), \n" + " 0.0, 1.0, 0.0, \n" + " -sin(theta), 0.0, cos(theta)); \n" + "vec3 dir = vec3(pos.yx - vec2(0.5), 0.0); \n" + "ipt = base + rot1 * rot2 * dir; \n" + // Convert back to RGB (for gamut boundary testing) + "lmspq = "$" * ipt; \n" + "lms = pow(max(lmspq, 0.0), vec3(1.0/%f)); \n" + "lms = max(lms - vec3(%f), 0.0) \n" + " / (vec3(%f) - %f * lms); \n" + "lms = pow(lms, vec3(1.0/%f)); \n" + "lms *= %f; \n" + // Check against src/dst gamut boundaries + "vec3 rgbsrc = "$" * lms; \n" + "vec3 rgbdst = "$" * lms; \n" + "bool insrc, indst; \n" + "insrc = all(lessThan(rgbsrc, vec3(rgbmax))) && \n" + " all(greaterThan(rgbsrc, vec3(rgbmin))); \n" + "indst = all(lessThan(rgbdst, vec3(rgbmax))) && \n" + " all(greaterThan(rgbdst, vec3(rgbmin))); \n" + // Sample from gamut mapping 3DLUT + "idx.x = (ipt.x - pqmin) / (pqmax - pqmin); \n" + "idx.y = 2.0 * length(ipt.yz); \n" + "idx.z = %f * atan(ipt.z, ipt.y) + 0.5; \n" + "vec3 mapped = "$"(idx).xyz; \n" + "mapped.yz -= vec2(32768.0/65535.0); \n" + "float mappedhue = atan(mapped.z, mapped.y); \n" + "float mappedchroma = length(mapped.yz); \n" + "ipt = mapped; \n" + // Visualize gamuts + "if (!insrc && !indst) { \n" + " ipt = orig; \n" + "} else if (insrc && !indst) { \n" + " ipt.x -= 0.1; \n" + "} else if (indst && !insrc) { \n" + " ipt.x += 0.1; \n" + "} \n" + // Visualize iso-luminance and iso-hue lines + "vec3 line; \n" + "if (insrc && fract(50.0 * mapped.x) < 1e-1) { \n" + " float k = smoothstep(0.1, 0.0, abs(sin(theta))); \n" + " line.x = mix(mapped.x, 0.3, 0.5); \n" + " line.yz = sqrt(length(mapped.yz)) * \n" + " normalize(mapped.yz); \n" + " ipt = mix(ipt, line, k); \n" + "} \n" + "if (insrc && fract(10.0 * (mappedhue - hue)) < 1e-1) {\n" + " float k = smoothstep(0.3, 0.0, abs(cos(theta))); \n" + " line.x = mapped.x - 0.05; \n" + " line.yz = 1.2 * mapped.yz; \n" + " ipt = mix(ipt, line, k); \n" + "} \n" + "if (insrc && fract(100.0 * mappedchroma) < 1e-1) { \n" + " line.x = mapped.x + 0.1; \n" + " line.yz = 0.4 * mapped.yz; \n" + " ipt = mix(ipt, line, 0.5); \n" + "} \n" + "} \n", + rect_pos(sh, rc), + SH_FLOAT(params->min_luma), SH_FLOAT(params->max_luma), + SH_FLOAT(pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, params->min_luma)), + SH_FLOAT(pl_hdr_rescale(PL_HDR_PQ, 
PL_HDR_NORM, params->max_luma)), + SH_FLOAT_DYN(hue), SH_FLOAT_DYN(theta), + ipt2lms, + PQ_M2, PQ_C1, PQ_C2, PQ_C3, PQ_M1, + 10000 / PL_COLOR_SDR_WHITE, + lms2rgb_src, + lms2rgb_dst, + 0.5f / M_PI, + lut); +} + +static void fill_tone_lut(void *data, const struct sh_lut_params *params) +{ + const struct pl_tone_map_params *lut_params = params->priv; + pl_tone_map_generate(data, lut_params); +} + +static void fill_gamut_lut(void *data, const struct sh_lut_params *params) +{ + const struct pl_gamut_map_params *lut_params = params->priv; + const int lut_size = params->width * params->height * params->depth; + void *tmp = pl_alloc(NULL, lut_size * sizeof(float) * lut_params->lut_stride); + pl_gamut_map_generate(tmp, lut_params); + + // Convert to 16-bit unsigned integer for GPU texture + const float *in = tmp; + uint16_t *out = data; + pl_assert(lut_params->lut_stride == 3); + pl_assert(params->comps == 4); + for (int i = 0; i < lut_size; i++) { + out[0] = roundf(in[0] * UINT16_MAX); + out[1] = roundf(in[1] * UINT16_MAX + (UINT16_MAX >> 1)); + out[2] = roundf(in[2] * UINT16_MAX + (UINT16_MAX >> 1)); + in += 3; + out += 4; + } + + pl_free(tmp); +} + +void pl_shader_color_map_ex(pl_shader sh, const struct pl_color_map_params *params, + const struct pl_color_map_args *args) +{ + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return; + + struct pl_color_space src = args->src, dst = args->dst; + pl_color_space_infer_map(&src, &dst); + if (pl_color_space_equal(&src, &dst)) { + if (args->prelinearized) + pl_shader_delinearize(sh, &dst); + return; + } + + struct sh_color_map_obj *obj = NULL; + if (args->state) { + pl_get_detected_hdr_metadata(*args->state, &src.hdr); + obj = SH_OBJ(sh, args->state, PL_SHADER_OBJ_COLOR_MAP, struct sh_color_map_obj, + sh_color_map_uninit); + if (!obj) + return; + } + + params = PL_DEF(params, &pl_color_map_default_params); + GLSL("// pl_shader_color_map \n" + "{ \n"); + + struct pl_tone_map_params tone = { + .function = PL_DEF(params->tone_mapping_function, &pl_tone_map_clip), + .constants = params->tone_constants, + .param = params->tone_mapping_param, + .input_scaling = PL_HDR_PQ, + .output_scaling = PL_HDR_PQ, + .lut_size = PL_DEF(params->lut_size, pl_color_map_default_params.lut_size), + .hdr = src.hdr, + }; + + pl_color_space_nominal_luma_ex(pl_nominal_luma_params( + .color = &src, + .metadata = params->metadata, + .scaling = tone.input_scaling, + .out_min = &tone.input_min, + .out_max = &tone.input_max, + .out_avg = &tone.input_avg, + )); + + pl_color_space_nominal_luma_ex(pl_nominal_luma_params( + .color = &dst, + .metadata = PL_HDR_METADATA_HDR10, + .scaling = tone.output_scaling, + .out_min = &tone.output_min, + .out_max = &tone.output_max, + )); + + pl_tone_map_params_infer(&tone); + + // Round sufficiently similar values + if (fabs(tone.input_max - tone.output_max) < 1e-6) + tone.output_max = tone.input_max; + if (fabs(tone.input_min - tone.output_min) < 1e-6) + tone.output_min = tone.input_min; + + if (!params->inverse_tone_mapping) { + // Never exceed the source unless requested, but still allow + // black point adaptation + tone.output_max = PL_MIN(tone.output_max, tone.input_max); + } + + const int *lut3d_size_def = pl_color_map_default_params.lut3d_size; + struct pl_gamut_map_params gamut = { + .function = PL_DEF(params->gamut_mapping, &pl_gamut_map_clip), + .constants = params->gamut_constants, + .input_gamut = src.hdr.prim, + .output_gamut = dst.hdr.prim, + .lut_size_I = PL_DEF(params->lut3d_size[0], lut3d_size_def[0]), + .lut_size_C = 
PL_DEF(params->lut3d_size[1], lut3d_size_def[1]), + .lut_size_h = PL_DEF(params->lut3d_size[2], lut3d_size_def[2]), + .lut_stride = 3, + }; + + float src_peak_static; + pl_color_space_nominal_luma_ex(pl_nominal_luma_params( + .color = &src, + .metadata = PL_HDR_METADATA_HDR10, + .scaling = PL_HDR_PQ, + .out_max = &src_peak_static, + )); + + pl_color_space_nominal_luma_ex(pl_nominal_luma_params( + .color = &dst, + .metadata = PL_HDR_METADATA_HDR10, + .scaling = PL_HDR_PQ, + .out_min = &gamut.min_luma, + .out_max = &gamut.max_luma, + )); + + // Clip the gamut mapping output to the input gamut if disabled + if (!params->gamut_expansion && gamut.function->bidirectional) { + if (pl_primaries_compatible(&gamut.input_gamut, &gamut.output_gamut)) { + gamut.output_gamut = pl_primaries_clip(&gamut.output_gamut, + &gamut.input_gamut); + } + } + + // Backwards compatibility with older API + switch (params->gamut_mode) { + case PL_GAMUT_CLIP: + switch (params->intent) { + case PL_INTENT_AUTO: + case PL_INTENT_PERCEPTUAL: + case PL_INTENT_RELATIVE_COLORIMETRIC: + break; // leave default + case PL_INTENT_SATURATION: + gamut.function = &pl_gamut_map_saturation; + break; + case PL_INTENT_ABSOLUTE_COLORIMETRIC: + gamut.function = &pl_gamut_map_absolute; + break; + } + break; + case PL_GAMUT_DARKEN: + gamut.function = &pl_gamut_map_darken; + break; + case PL_GAMUT_WARN: + gamut.function = &pl_gamut_map_highlight; + break; + case PL_GAMUT_DESATURATE: + gamut.function = &pl_gamut_map_desaturate; + break; + case PL_GAMUT_MODE_COUNT: + pl_unreachable(); + } + + bool can_fast = !params->force_tone_mapping_lut; + if (!args->state) { + // No state object provided, forcibly disable advanced methods + can_fast = true; + if (tone.function != &pl_tone_map_clip) + tone.function = &pl_tone_map_linear; + if (gamut.function != &pl_gamut_map_clip) + gamut.function = &pl_gamut_map_saturation; + } + + pl_fmt gamut_fmt = pl_find_fmt(SH_GPU(sh), PL_FMT_UNORM, 4, 16, 16, PL_FMT_CAP_LINEAR); + if (!gamut_fmt) { + gamut.function = &pl_gamut_map_saturation; + can_fast = true; + } + + bool need_tone_map = !pl_tone_map_params_noop(&tone); + bool need_gamut_map = !pl_gamut_map_params_noop(&gamut); + + if (!args->prelinearized) + pl_shader_linearize(sh, &src); + + pl_matrix3x3 rgb2lms = pl_ipt_rgb2lms(pl_raw_primaries_get(src.primaries)); + pl_matrix3x3 lms2rgb = pl_ipt_lms2rgb(pl_raw_primaries_get(dst.primaries)); + ident_t lms2ipt = SH_MAT3(pl_ipt_lms2ipt); + ident_t ipt2lms = SH_MAT3(pl_ipt_ipt2lms); + + if (need_gamut_map && gamut.function == &pl_gamut_map_saturation && can_fast) { + const pl_matrix3x3 lms2src = pl_ipt_lms2rgb(&gamut.input_gamut); + const pl_matrix3x3 dst2lms = pl_ipt_rgb2lms(&gamut.output_gamut); + sh_describe(sh, "gamut map (saturation)"); + pl_matrix3x3_mul(&lms2rgb, &dst2lms); + pl_matrix3x3_mul(&lms2rgb, &lms2src); + need_gamut_map = false; + } + + // Fast path: simply convert between primaries (if needed) + if (!need_tone_map && !need_gamut_map) { + if (src.primaries != dst.primaries) { + sh_describe(sh, "colorspace conversion"); + pl_matrix3x3_mul(&lms2rgb, &rgb2lms); + GLSL("color.rgb = "$" * color.rgb; \n", SH_MAT3(lms2rgb)); + } + goto done; + } + + // Full path: convert input from normalized RGB to IPT + GLSL("vec3 lms = "$" * color.rgb; \n" + "vec3 lmspq = %f * lms; \n" + "lmspq = pow(max(lmspq, 0.0), vec3(%f)); \n" + "lmspq = (vec3(%f) + %f * lmspq) \n" + " / (vec3(1.0) + %f * lmspq); \n" + "lmspq = pow(lmspq, vec3(%f)); \n" + "vec3 ipt = "$" * lmspq; \n" + "float i_orig = ipt.x; \n", + 
SH_MAT3(rgb2lms), + PL_COLOR_SDR_WHITE / 10000, + PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2, + lms2ipt); + + if (params->show_clipping) { + const float eps = 1e-6f; + GLSL("bool clip_hi, clip_lo; \n" + "clip_hi = any(greaterThan(color.rgb, vec3("$"))); \n" + "clip_lo = any(lessThan(color.rgb, vec3("$"))); \n" + "clip_hi = clip_hi || ipt.x > "$"; \n" + "clip_lo = clip_lo || ipt.x < "$"; \n", + SH_FLOAT_DYN(pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, tone.input_max) + eps), + SH_FLOAT(pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, tone.input_min) - eps), + SH_FLOAT_DYN(tone.input_max + eps), + SH_FLOAT(tone.input_min - eps)); + } + + if (need_tone_map) { + const struct pl_tone_map_function *fun = tone.function; + sh_describef(sh, "%s tone map (%.0f -> %.0f)", fun->name, + pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NITS, tone.input_max), + pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NITS, tone.output_max)); + + if (fun == &pl_tone_map_clip && can_fast) { + + GLSL("#define tone_map(x) clamp((x), "$", "$") \n", + SH_FLOAT(tone.input_min), + SH_FLOAT_DYN(tone.input_max)); + + } else if (fun == &pl_tone_map_linear && can_fast) { + + const float gain = tone.constants.exposure; + const float scale = tone.input_max - tone.input_min; + + ident_t linfun = sh_fresh(sh, "linear_pq"); + GLSLH("float "$"(float x) { \n" + // Stretch the input range (while clipping) + " x = "$" * x + "$"; \n" + " x = clamp(x, 0.0, 1.0); \n" + " x = "$" * x + "$"; \n" + " return x; \n" + "} \n", + linfun, + SH_FLOAT_DYN(gain / scale), + SH_FLOAT_DYN(-gain / scale * tone.input_min), + SH_FLOAT_DYN(tone.output_max - tone.output_min), + SH_FLOAT(tone.output_min)); + + GLSL("#define tone_map(x) ("$"(x)) \n", linfun); + + } else { + + pl_assert(obj); + ident_t lut = sh_lut(sh, sh_lut_params( + .object = &obj->tone.lut, + .var_type = PL_VAR_FLOAT, + .lut_type = SH_LUT_AUTO, + .method = SH_LUT_LINEAR, + .width = tone.lut_size, + .comps = 1, + .update = !pl_tone_map_params_equal(&tone, &obj->tone.params), + .dynamic = tone.input_avg > 0, // dynamic metadata + .fill = fill_tone_lut, + .priv = &tone, + )); + obj->tone.params = tone; + if (!lut) { + SH_FAIL(sh, "Failed generating tone-mapping LUT!"); + return; + } + + const float lut_range = tone.input_max - tone.input_min; + GLSL("#define tone_map(x) ("$"("$" * (x) + "$")) \n", + lut, SH_FLOAT_DYN(1.0f / lut_range), + SH_FLOAT_DYN(-tone.input_min / lut_range)); + + } + + bool need_recovery = tone.input_max >= tone.output_max; + if (need_recovery && params->contrast_recovery && args->feature_map) { + ident_t pos, pt; + ident_t lowres = sh_bind(sh, args->feature_map, PL_TEX_ADDRESS_CLAMP, + PL_TEX_SAMPLE_LINEAR, "feature_map", + NULL, &pos, &pt); + + // Obtain HF detail map from bicubic interpolation of LF features + GLSL("vec2 lpos = "$"; \n" + "vec2 lpt = "$"; \n" + "vec2 lsize = vec2(textureSize("$", 0)); \n" + "vec2 frac = fract(lpos * lsize + vec2(0.5)); \n" + "vec2 frac2 = frac * frac; \n" + "vec2 inv = vec2(1.0) - frac; \n" + "vec2 inv2 = inv * inv; \n" + "vec2 w0 = 1.0/6.0 * inv2 * inv; \n" + "vec2 w1 = 2.0/3.0 - 0.5 * frac2 * (2.0 - frac); \n" + "vec2 w2 = 2.0/3.0 - 0.5 * inv2 * (2.0 - inv); \n" + "vec2 w3 = 1.0/6.0 * frac2 * frac; \n" + "vec4 g = vec4(w0 + w1, w2 + w3); \n" + "vec4 h = vec4(w1, w3) / g + inv.xyxy; \n" + "h.xy -= vec2(2.0); \n" + "vec4 p = lpos.xyxy + lpt.xyxy * h; \n" + "float l00 = textureLod("$", p.xy, 0.0).r; \n" + "float l01 = textureLod("$", p.xw, 0.0).r; \n" + "float l0 = mix(l01, l00, g.y); \n" + "float l10 = textureLod("$", p.zy, 0.0).r; \n" + "float l11 = textureLod("$", p.zw, 0.0).r; \n" + 
"float l1 = mix(l11, l10, g.y); \n" + "float luma = mix(l1, l0, g.x); \n" + // Mix low-resolution tone mapped image with high-resolution + // tone mapped image according to desired strength. + "float highres = clamp(ipt.x, 0.0, 1.0); \n" + "float lowres = clamp(luma, 0.0, 1.0); \n" + "float detail = highres - lowres; \n" + "float base = tone_map(highres); \n" + "float sharp = tone_map(lowres) + detail; \n" + "ipt.x = clamp(mix(base, sharp, "$"), "$", "$"); \n", + pos, pt, lowres, + lowres, lowres, lowres, lowres, + SH_FLOAT(params->contrast_recovery), + SH_FLOAT(tone.output_min), SH_FLOAT_DYN(tone.output_max)); + + } else { + + GLSL("ipt.x = tone_map(ipt.x); \n"); + } + + // Avoid raising saturation excessively when raising brightness, and + // also desaturate when reducing brightness greatly to account for the + // reduction in gamut volume. + GLSL("vec2 hull = vec2(i_orig, ipt.x); \n" + "hull = ((hull - 6.0) * hull + 9.0) * hull; \n" + "ipt.yz *= min(i_orig / ipt.x, hull.y / hull.x); \n"); + } + + if (need_gamut_map) { + const struct pl_gamut_map_function *fun = gamut.function; + sh_describef(sh, "gamut map (%s)", fun->name); + + pl_assert(obj); + ident_t lut = sh_lut(sh, sh_lut_params( + .object = &obj->gamut.lut, + .var_type = PL_VAR_FLOAT, + .lut_type = SH_LUT_TEXTURE, + .fmt = gamut_fmt, + .method = params->lut3d_tricubic ? SH_LUT_CUBIC : SH_LUT_LINEAR, + .width = gamut.lut_size_I, + .height = gamut.lut_size_C, + .depth = gamut.lut_size_h, + .comps = 4, + .signature = gamut_map_signature(&gamut), + .cache = SH_CACHE(sh), + .fill = fill_gamut_lut, + .priv = &gamut, + )); + if (!lut) { + SH_FAIL(sh, "Failed generating gamut-mapping LUT!"); + return; + } + + // 3D LUT lookup (in ICh space) + const float lut_range = gamut.max_luma - gamut.min_luma; + GLSL("vec3 idx; \n" + "idx.x = "$" * ipt.x + "$"; \n" + "idx.y = 2.0 * length(ipt.yz); \n" + "idx.z = %f * atan(ipt.z, ipt.y) + 0.5;\n" + "ipt = "$"(idx).xyz; \n" + "ipt.yz -= vec2(32768.0/65535.0); \n", + SH_FLOAT(1.0f / lut_range), + SH_FLOAT(-gamut.min_luma / lut_range), + 0.5f / M_PI, lut); + + if (params->show_clipping) { + GLSL("clip_lo = clip_lo || any(lessThan(idx, vec3(0.0))); \n" + "clip_hi = clip_hi || any(greaterThan(idx, vec3(1.0))); \n"); + } + + if (params->visualize_lut) { + visualize_gamut_map(sh, params->visualize_rect, lut, + params->visualize_hue, params->visualize_theta, + &gamut); + } + } + + // Convert IPT back to linear RGB + GLSL("lmspq = "$" * ipt; \n" + "lms = pow(max(lmspq, 0.0), vec3(1.0/%f)); \n" + "lms = max(lms - vec3(%f), 0.0) \n" + " / (vec3(%f) - %f * lms); \n" + "lms = pow(lms, vec3(1.0/%f)); \n" + "lms *= %f; \n" + "color.rgb = "$" * lms; \n", + ipt2lms, + PQ_M2, PQ_C1, PQ_C2, PQ_C3, PQ_M1, + 10000 / PL_COLOR_SDR_WHITE, + SH_MAT3(lms2rgb)); + + if (params->show_clipping) { + GLSL("if (clip_hi) { \n" + " float k = dot(color.rgb, vec3(2.0 / 3.0)); \n" + " color.rgb = clamp(vec3(k) - color.rgb, 0.0, 1.0); \n" + " float cmin = min(min(color.r, color.g), color.b); \n" + " float cmax = max(max(color.r, color.g), color.b); \n" + " float delta = cmax - cmin; \n" + " vec3 sat = smoothstep(cmin - 1e-6, cmax, color.rgb); \n" + " const vec3 red = vec3(1.0, 0.0, 0.0); \n" + " color.rgb = mix(red, sat, smoothstep(0.0, 0.3, delta)); \n" + "} else if (clip_lo) { \n" + " vec3 hi = vec3(0.0, 0.3, 0.3); \n" + " color.rgb = mix(color.rgb, hi, 0.5); \n" + "} \n"); + } + + if (need_tone_map) { + if (params->visualize_lut) { + float alpha = need_gamut_map ? 
powf(cosf(params->visualize_theta), 5.0f) : 1.0f; + visualize_tone_map(sh, params->visualize_rect, alpha, &tone); + } + GLSL("#undef tone_map \n"); + } + +done: + pl_shader_delinearize(sh, &dst); + GLSL("}\n"); +} + +// Backwards compatibility wrapper around `pl_shader_color_map_ex` +void pl_shader_color_map(pl_shader sh, const struct pl_color_map_params *params, + struct pl_color_space src, struct pl_color_space dst, + pl_shader_obj *state, bool prelinearized) +{ + pl_shader_color_map_ex(sh, params, pl_color_map_args( + .src = src, + .dst = dst, + .prelinearized = prelinearized, + .state = state, + .feature_map = NULL + )); +} + +void pl_shader_cone_distort(pl_shader sh, struct pl_color_space csp, + const struct pl_cone_params *params) +{ + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return; + if (!params || !params->cones) + return; + + sh_describe(sh, "cone distortion"); + GLSL("// pl_shader_cone_distort\n"); + GLSL("{\n"); + + pl_color_space_infer(&csp); + pl_shader_linearize(sh, &csp); + + pl_matrix3x3 cone_mat; + cone_mat = pl_get_cone_matrix(params, pl_raw_primaries_get(csp.primaries)); + GLSL("color.rgb = "$" * color.rgb; \n", sh_var(sh, (struct pl_shader_var) { + .var = pl_var_mat3("cone_mat"), + .data = PL_TRANSPOSE_3X3(cone_mat.m), + })); + + pl_shader_delinearize(sh, &csp); + GLSL("}\n"); +} diff --git a/src/shaders/custom.c b/src/shaders/custom.c new file mode 100644 index 0000000..3f03e57 --- /dev/null +++ b/src/shaders/custom.c @@ -0,0 +1,89 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "shaders.h" + +#include <libplacebo/shaders/custom.h> + +bool pl_shader_custom(pl_shader sh, const struct pl_custom_shader *params) +{ + if (params->compute) { + int bw = PL_DEF(params->compute_group_size[0], 16); + int bh = PL_DEF(params->compute_group_size[1], 16); + bool flex = !params->compute_group_size[0] || + !params->compute_group_size[1]; + if (!sh_try_compute(sh, bw, bh, flex, params->compute_shmem)) + return false; + } + + if (!sh_require(sh, params->input, params->output_w, params->output_h)) + return false; + + sh->output = params->output; + + for (int i = 0; i < params->num_variables; i++) { + struct pl_shader_var sv = params->variables[i]; + GLSLP("#define %s "$"\n", sv.var.name, sh_var(sh, sv)); + } + + for (int i = 0; i < params->num_descriptors; i++) { + struct pl_shader_desc sd = params->descriptors[i]; + GLSLP("#define %s "$"\n", sd.desc.name, sh_desc(sh, sd)); + } + + for (int i = 0; i < params->num_vertex_attribs; i++) { + struct pl_shader_va sva = params->vertex_attribs[i]; + GLSLP("#define %s "$"\n", sva.attr.name, sh_attr(sh, sva)); + } + + for (int i = 0; i < params->num_constants; i++) { + struct pl_shader_const sc = params->constants[i]; + GLSLP("#define %s "$"\n", sc.name, sh_const(sh, sc)); + } + + if (params->prelude) + GLSLP("// pl_shader_custom prelude: \n%s\n", params->prelude); + if (params->header) + GLSLH("// pl_shader_custom header: \n%s\n", params->header); + + if (params->description) + sh_describef(sh, "%s", params->description); + + if (params->body) { + const char *output_decl = ""; + if (params->output != params->input) { + switch (params->output) { + case PL_SHADER_SIG_NONE: break; + case PL_SHADER_SIG_COLOR: + output_decl = "vec4 color = vec4(0.0);"; + break; + + case PL_SHADER_SIG_SAMPLER: + pl_unreachable(); + } + } + + GLSL("// pl_shader_custom \n" + "%s \n" + "{ \n" + "%s \n" + "} \n", + output_decl, params->body); + } + + return true; +} diff --git a/src/shaders/custom_mpv.c b/src/shaders/custom_mpv.c new file mode 100644 index 0000000..4ef0817 --- /dev/null +++ b/src/shaders/custom_mpv.c @@ -0,0 +1,1768 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include <math.h> +#include <limits.h> + +#include "gpu.h" +#include "shaders.h" + +#include <libplacebo/shaders/colorspace.h> +#include <libplacebo/shaders/custom.h> + +// Hard-coded size limits, mainly for convenience (to avoid dynamic memory) +#define SHADER_MAX_HOOKS 16 +#define SHADER_MAX_BINDS 16 +#define MAX_SHEXP_SIZE 32 + +enum shexp_op { + SHEXP_OP_ADD, + SHEXP_OP_SUB, + SHEXP_OP_MUL, + SHEXP_OP_DIV, + SHEXP_OP_MOD, + SHEXP_OP_NOT, + SHEXP_OP_GT, + SHEXP_OP_LT, + SHEXP_OP_EQ, +}; + +enum shexp_tag { + SHEXP_END = 0, // End of an RPN expression + SHEXP_CONST, // Push a constant value onto the stack + SHEXP_TEX_W, // Get the width/height of a named texture (variable) + SHEXP_TEX_H, + SHEXP_OP2, // Pop two elements and push the result of a dyadic operation + SHEXP_OP1, // Pop one element and push the result of a monadic operation + SHEXP_VAR, // Arbitrary variable (e.g. shader parameters) +}; + +struct shexp { + enum shexp_tag tag; + union { + float cval; + pl_str varname; + enum shexp_op op; + } val; +}; + +struct custom_shader_hook { + // Variable/literal names of textures + pl_str pass_desc; + pl_str hook_tex[SHADER_MAX_HOOKS]; + pl_str bind_tex[SHADER_MAX_BINDS]; + pl_str save_tex; + + // Shader body itself + metadata + pl_str pass_body; + float offset[2]; + bool offset_align; + int comps; + + // Special expressions governing the output size and execution conditions + struct shexp width[MAX_SHEXP_SIZE]; + struct shexp height[MAX_SHEXP_SIZE]; + struct shexp cond[MAX_SHEXP_SIZE]; + + // Special metadata for compute shaders + bool is_compute; + int block_w, block_h; // Block size (each block corresponds to one WG) + int threads_w, threads_h; // How many threads form a WG +}; + +static bool parse_rpn_shexpr(pl_str line, struct shexp out[MAX_SHEXP_SIZE]) +{ + int pos = 0; + + while (line.len > 0) { + pl_str word = pl_str_split_char(line, ' ', &line); + if (word.len == 0) + continue; + + if (pos >= MAX_SHEXP_SIZE) + return false; + + struct shexp *exp = &out[pos++]; + + if (pl_str_eatend0(&word, ".w") || pl_str_eatend0(&word, ".width")) { + exp->tag = SHEXP_TEX_W; + exp->val.varname = word; + continue; + } + + if (pl_str_eatend0(&word, ".h") || pl_str_eatend0(&word, ".height")) { + exp->tag = SHEXP_TEX_H; + exp->val.varname = word; + continue; + } + + switch (word.buf[0]) { + case '+': exp->tag = SHEXP_OP2; exp->val.op = SHEXP_OP_ADD; continue; + case '-': exp->tag = SHEXP_OP2; exp->val.op = SHEXP_OP_SUB; continue; + case '*': exp->tag = SHEXP_OP2; exp->val.op = SHEXP_OP_MUL; continue; + case '/': exp->tag = SHEXP_OP2; exp->val.op = SHEXP_OP_DIV; continue; + case '%': exp->tag = SHEXP_OP2; exp->val.op = SHEXP_OP_MOD; continue; + case '!': exp->tag = SHEXP_OP1; exp->val.op = SHEXP_OP_NOT; continue; + case '>': exp->tag = SHEXP_OP2; exp->val.op = SHEXP_OP_GT; continue; + case '<': exp->tag = SHEXP_OP2; exp->val.op = SHEXP_OP_LT; continue; + case '=': exp->tag = SHEXP_OP2; exp->val.op = SHEXP_OP_EQ; continue; + } + + if (word.buf[0] >= '0' && word.buf[0] <= '9') { + exp->tag = SHEXP_CONST; + if (!pl_str_parse_float(word, &exp->val.cval)) + return false; + continue; + } + + // Treat as generic variable + exp->tag = SHEXP_VAR; + exp->val.varname = word; + } + + return true; +} + +static inline pl_str split_magic(pl_str *body) +{ + pl_str ret = pl_str_split_str0(*body, "//!", body); + if (body->len) { + // Make sure the separator is included in the remainder + body->buf -= 3; + body->len += 3; + } + + return ret; +} + +static bool parse_hook(pl_log log, pl_str *body, struct 
custom_shader_hook *out) +{ + *out = (struct custom_shader_hook){ + .pass_desc = pl_str0("unknown user shader"), + .width = {{ SHEXP_TEX_W, { .varname = pl_str0("HOOKED") }}}, + .height = {{ SHEXP_TEX_H, { .varname = pl_str0("HOOKED") }}}, + .cond = {{ SHEXP_CONST, { .cval = 1.0 }}}, + }; + + int hook_idx = 0; + int bind_idx = 0; + + // Parse all headers + while (true) { + pl_str rest; + pl_str line = pl_str_strip(pl_str_getline(*body, &rest)); + + // Check for the presence of the magic line beginning + if (!pl_str_eatstart0(&line, "//!")) + break; + + *body = rest; + + // Parse the supported commands + if (pl_str_eatstart0(&line, "HOOK")) { + if (hook_idx == SHADER_MAX_HOOKS) { + pl_err(log, "Passes may only hook up to %d textures!", + SHADER_MAX_HOOKS); + return false; + } + out->hook_tex[hook_idx++] = pl_str_strip(line); + continue; + } + + if (pl_str_eatstart0(&line, "BIND")) { + if (bind_idx == SHADER_MAX_BINDS) { + pl_err(log, "Passes may only bind up to %d textures!", + SHADER_MAX_BINDS); + return false; + } + out->bind_tex[bind_idx++] = pl_str_strip(line); + continue; + } + + if (pl_str_eatstart0(&line, "SAVE")) { + pl_str save_tex = pl_str_strip(line); + if (pl_str_equals0(save_tex, "HOOKED")) { + // This is a special name that means "overwrite existing" + // texture, which we just signal by not having any `save_tex` + // name set. + out->save_tex = (pl_str) {0}; + } else if (pl_str_equals0(save_tex, "MAIN")) { + // Compatibility alias + out->save_tex = pl_str0("MAINPRESUB"); + } else { + out->save_tex = save_tex; + }; + continue; + } + + if (pl_str_eatstart0(&line, "DESC")) { + out->pass_desc = pl_str_strip(line); + continue; + } + + if (pl_str_eatstart0(&line, "OFFSET")) { + line = pl_str_strip(line); + if (pl_str_equals0(line, "ALIGN")) { + out->offset_align = true; + } else { + if (!pl_str_parse_float(pl_str_split_char(line, ' ', &line), &out->offset[0]) || + !pl_str_parse_float(pl_str_split_char(line, ' ', &line), &out->offset[1]) || + line.len) + { + pl_err(log, "Error while parsing OFFSET!"); + return false; + } + } + continue; + } + + if (pl_str_eatstart0(&line, "WIDTH")) { + if (!parse_rpn_shexpr(line, out->width)) { + pl_err(log, "Error while parsing WIDTH!"); + return false; + } + continue; + } + + if (pl_str_eatstart0(&line, "HEIGHT")) { + if (!parse_rpn_shexpr(line, out->height)) { + pl_err(log, "Error while parsing HEIGHT!"); + return false; + } + continue; + } + + if (pl_str_eatstart0(&line, "WHEN")) { + if (!parse_rpn_shexpr(line, out->cond)) { + pl_err(log, "Error while parsing WHEN!"); + return false; + } + continue; + } + + if (pl_str_eatstart0(&line, "COMPONENTS")) { + if (!pl_str_parse_int(pl_str_strip(line), &out->comps)) { + pl_err(log, "Error parsing COMPONENTS: '%.*s'", PL_STR_FMT(line)); + return false; + } + continue; + } + + if (pl_str_eatstart0(&line, "COMPUTE")) { + line = pl_str_strip(line); + bool ok = pl_str_parse_int(pl_str_split_char(line, ' ', &line), &out->block_w) && + pl_str_parse_int(pl_str_split_char(line, ' ', &line), &out->block_h); + + line = pl_str_strip(line); + if (ok && line.len) { + ok = pl_str_parse_int(pl_str_split_char(line, ' ', &line), &out->threads_w) && + pl_str_parse_int(pl_str_split_char(line, ' ', &line), &out->threads_h) && + !line.len; + } else { + out->threads_w = out->block_w; + out->threads_h = out->block_h; + } + + if (!ok) { + pl_err(log, "Error while parsing COMPUTE!"); + return false; + } + + out->is_compute = true; + continue; + } + + // Unknown command type + pl_err(log, "Unrecognized command '%.*s'!", 
PL_STR_FMT(line)); + return false; + } + + // The rest of the file up until the next magic line beginning (if any) + // shall be the shader body + out->pass_body = split_magic(body); + + // Sanity checking + if (hook_idx == 0) + pl_warn(log, "Pass has no hooked textures (will be ignored)!"); + + return true; +} + +static bool parse_tex(pl_gpu gpu, void *alloc, pl_str *body, + struct pl_shader_desc *out) +{ + *out = (struct pl_shader_desc) { + .desc = { + .name = "USER_TEX", + .type = PL_DESC_SAMPLED_TEX, + }, + }; + + struct pl_tex_params params = { + .w = 1, .h = 1, .d = 0, + .sampleable = true, + .debug_tag = PL_DEBUG_TAG, + }; + + while (true) { + pl_str rest; + pl_str line = pl_str_strip(pl_str_getline(*body, &rest)); + + if (!pl_str_eatstart0(&line, "//!")) + break; + + *body = rest; + + if (pl_str_eatstart0(&line, "TEXTURE")) { + out->desc.name = pl_strdup0(alloc, pl_str_strip(line)); + continue; + } + + if (pl_str_eatstart0(&line, "SIZE")) { + line = pl_str_strip(line); + int dims = 0; + int dim[4]; // extra space to catch invalid extra entries + while (line.len && dims < PL_ARRAY_SIZE(dim)) { + if (!pl_str_parse_int(pl_str_split_char(line, ' ', &line), &dim[dims++])) { + PL_ERR(gpu, "Error while parsing SIZE!"); + return false; + } + } + + uint32_t lim = dims == 1 ? gpu->limits.max_tex_1d_dim + : dims == 2 ? gpu->limits.max_tex_2d_dim + : dims == 3 ? gpu->limits.max_tex_3d_dim + : 0; + + // Sanity check against GPU size limits + switch (dims) { + case 3: + params.d = dim[2]; + if (params.d < 1 || params.d > lim) { + PL_ERR(gpu, "SIZE %d exceeds GPU's texture size limits (%d)!", + params.d, lim); + return false; + } + // fall through + case 2: + params.h = dim[1]; + if (params.h < 1 || params.h > lim) { + PL_ERR(gpu, "SIZE %d exceeds GPU's texture size limits (%d)!", + params.h, lim); + return false; + } + // fall through + case 1: + params.w = dim[0]; + if (params.w < 1 || params.w > lim) { + PL_ERR(gpu, "SIZE %d exceeds GPU's texture size limits (%d)!", + params.w, lim); + return false; + } + break; + + default: + PL_ERR(gpu, "Invalid number of texture dimensions!"); + return false; + }; + + // Clear out the superfluous components + if (dims < 3) + params.d = 0; + if (dims < 2) + params.h = 0; + continue; + } + + if (pl_str_eatstart0(&line, "FORMAT")) { + line = pl_str_strip(line); + params.format = NULL; + for (int n = 0; n < gpu->num_formats; n++) { + pl_fmt fmt = gpu->formats[n]; + if (pl_str_equals0(line, fmt->name)) { + params.format = fmt; + break; + } + } + + if (!params.format || params.format->opaque) { + PL_ERR(gpu, "Unrecognized/unavailable FORMAT name: '%.*s'!", + PL_STR_FMT(line)); + return false; + } + + if (!(params.format->caps & PL_FMT_CAP_SAMPLEABLE)) { + PL_ERR(gpu, "Chosen FORMAT '%.*s' is not sampleable!", + PL_STR_FMT(line)); + return false; + } + continue; + } + + if (pl_str_eatstart0(&line, "FILTER")) { + line = pl_str_strip(line); + if (pl_str_equals0(line, "LINEAR")) { + out->binding.sample_mode = PL_TEX_SAMPLE_LINEAR; + } else if (pl_str_equals0(line, "NEAREST")) { + out->binding.sample_mode = PL_TEX_SAMPLE_NEAREST; + } else { + PL_ERR(gpu, "Unrecognized FILTER: '%.*s'!", PL_STR_FMT(line)); + return false; + } + continue; + } + + if (pl_str_eatstart0(&line, "BORDER")) { + line = pl_str_strip(line); + if (pl_str_equals0(line, "CLAMP")) { + out->binding.address_mode = PL_TEX_ADDRESS_CLAMP; + } else if (pl_str_equals0(line, "REPEAT")) { + out->binding.address_mode = PL_TEX_ADDRESS_REPEAT; + } else if (pl_str_equals0(line, "MIRROR")) { + 
out->binding.address_mode = PL_TEX_ADDRESS_MIRROR; + } else { + PL_ERR(gpu, "Unrecognized BORDER: '%.*s'!", PL_STR_FMT(line)); + return false; + } + continue; + } + + if (pl_str_eatstart0(&line, "STORAGE")) { + params.storable = true; + out->desc.type = PL_DESC_STORAGE_IMG; + out->desc.access = PL_DESC_ACCESS_READWRITE; + out->memory = PL_MEMORY_COHERENT; + continue; + } + + PL_ERR(gpu, "Unrecognized command '%.*s'!", PL_STR_FMT(line)); + return false; + } + + if (!params.format) { + PL_ERR(gpu, "No FORMAT specified!"); + return false; + } + + int caps = params.format->caps; + if (out->binding.sample_mode == PL_TEX_SAMPLE_LINEAR && !(caps & PL_FMT_CAP_LINEAR)) { + PL_ERR(gpu, "The specified texture format cannot be linear filtered!"); + return false; + } + + // Decode the rest of the section (up to the next //! marker) as raw hex + // data for the texture + pl_str tex, hexdata = split_magic(body); + if (!pl_str_decode_hex(NULL, pl_str_strip(hexdata), &tex)) { + PL_ERR(gpu, "Error while parsing TEXTURE body: must be a valid " + "hexadecimal sequence!"); + return false; + } + + int texels = params.w * PL_DEF(params.h, 1) * PL_DEF(params.d, 1); + size_t expected_len = texels * params.format->texel_size; + if (tex.len == 0 && params.storable) { + // In this case, it's okay that the texture has no initial data + pl_free_ptr(&tex.buf); + } else if (tex.len != expected_len) { + PL_ERR(gpu, "Shader TEXTURE size mismatch: got %zu bytes, expected %zu!", + tex.len, expected_len); + pl_free(tex.buf); + return false; + } + + params.initial_data = tex.buf; + out->binding.object = pl_tex_create(gpu, ¶ms); + pl_free(tex.buf); + + if (!out->binding.object) { + PL_ERR(gpu, "Failed creating custom texture!"); + return false; + } + + return true; +} + +static bool parse_buf(pl_gpu gpu, void *alloc, pl_str *body, + struct pl_shader_desc *out) +{ + *out = (struct pl_shader_desc) { + .desc = { + .name = "USER_BUF", + .type = PL_DESC_BUF_UNIFORM, + }, + }; + + // Temporary, to allow deferring variable placement until all headers + // have been processed (in order to e.g. 
determine buffer type) + void *tmp = pl_tmp(alloc); // will be freed automatically on failure + PL_ARRAY(struct pl_var) vars = {0}; + + while (true) { + pl_str rest; + pl_str line = pl_str_strip(pl_str_getline(*body, &rest)); + + if (!pl_str_eatstart0(&line, "//!")) + break; + + *body = rest; + + if (pl_str_eatstart0(&line, "BUFFER")) { + out->desc.name = pl_strdup0(alloc, pl_str_strip(line)); + continue; + } + + if (pl_str_eatstart0(&line, "STORAGE")) { + out->desc.type = PL_DESC_BUF_STORAGE; + out->desc.access = PL_DESC_ACCESS_READWRITE; + out->memory = PL_MEMORY_COHERENT; + continue; + } + + if (pl_str_eatstart0(&line, "VAR")) { + pl_str type_name = pl_str_split_char(pl_str_strip(line), ' ', &line); + struct pl_var var = {0}; + for (const struct pl_named_var *nv = pl_var_glsl_types; nv->glsl_name; nv++) { + if (pl_str_equals0(type_name, nv->glsl_name)) { + var = nv->var; + break; + } + } + + if (!var.type) { + // No type found + PL_ERR(gpu, "Unrecognized GLSL type '%.*s'!", PL_STR_FMT(type_name)); + return false; + } + + pl_str var_name = pl_str_split_char(line, '[', &line); + if (line.len > 0) { + // Parse array dimension + if (!pl_str_parse_int(pl_str_split_char(line, ']', NULL), &var.dim_a)) { + PL_ERR(gpu, "Failed parsing array dimension from [%.*s!", + PL_STR_FMT(line)); + return false; + } + + if (var.dim_a < 1) { + PL_ERR(gpu, "Invalid array dimension %d!", var.dim_a); + return false; + } + } + + var.name = pl_strdup0(alloc, pl_str_strip(var_name)); + PL_ARRAY_APPEND(tmp, vars, var); + continue; + } + + PL_ERR(gpu, "Unrecognized command '%.*s'!", PL_STR_FMT(line)); + return false; + } + + // Try placing all of the buffer variables + for (int i = 0; i < vars.num; i++) { + if (!sh_buf_desc_append(alloc, gpu, out, NULL, vars.elem[i])) { + PL_ERR(gpu, "Custom buffer exceeds GPU limitations!"); + return false; + } + } + + // Decode the rest of the section (up to the next //! marker) as raw hex + // data for the buffer + pl_str data, hexdata = split_magic(body); + if (!pl_str_decode_hex(tmp, pl_str_strip(hexdata), &data)) { + PL_ERR(gpu, "Error while parsing BUFFER body: must be a valid " + "hexadecimal sequence!"); + return false; + } + + size_t buf_size = sh_buf_desc_size(out); + if (data.len == 0 && out->desc.type == PL_DESC_BUF_STORAGE) { + // In this case, it's okay that the buffer has no initial data + } else if (data.len != buf_size) { + PL_ERR(gpu, "Shader BUFFER size mismatch: got %zu bytes, expected %zu!", + data.len, buf_size); + return false; + } + + out->binding.object = pl_buf_create(gpu, pl_buf_params( + .size = buf_size, + .uniform = out->desc.type == PL_DESC_BUF_UNIFORM, + .storable = out->desc.type == PL_DESC_BUF_STORAGE, + .initial_data = data.len ? 
data.buf : NULL, + )); + + if (!out->binding.object) { + PL_ERR(gpu, "Failed creating custom buffer!"); + return false; + } + + pl_free(tmp); + return true; +} + +static bool parse_var(pl_log log, pl_str str, enum pl_var_type type, pl_var_data *out) +{ + if (!str.len) + return true; + + pl_str buf = str; + bool ok = false; + switch (type) { + case PL_VAR_SINT: + ok = pl_str_parse_int(pl_str_split_char(buf, ' ', &buf), &out->i); + break; + case PL_VAR_UINT: + ok = pl_str_parse_uint(pl_str_split_char(buf, ' ', &buf), &out->u); + break; + case PL_VAR_FLOAT: + ok = pl_str_parse_float(pl_str_split_char(buf, ' ', &buf), &out->f); + break; + case PL_VAR_INVALID: + case PL_VAR_TYPE_COUNT: + pl_unreachable(); + } + + if (pl_str_strip(buf).len > 0) + ok = false; // left-over garbage + + if (!ok) { + pl_err(log, "Failed parsing variable data: %.*s", PL_STR_FMT(str)); + return false; + } + + return true; +} + +static bool check_bounds(pl_log log, enum pl_var_type type, const pl_var_data data, + const pl_var_data minimum, const pl_var_data maximum) +{ +#define CHECK_BOUNDS(v, fmt) do \ +{ \ + if (data.v < minimum.v) { \ + pl_err(log, "Initial value "fmt" below declared minimum "fmt"!", \ + data.v, minimum.v); \ + return false; \ + } \ + if (data.v > maximum.v) { \ + pl_err(log, "Initial value "fmt" above declared maximum "fmt"!", \ + data.v, maximum.v); \ + return false; \ + } \ +} while (0) + + switch (type) { + case PL_VAR_SINT: + CHECK_BOUNDS(i, "%d"); + break; + case PL_VAR_UINT: + CHECK_BOUNDS(u, "%u"); + break; + case PL_VAR_FLOAT: + CHECK_BOUNDS(f, "%f"); + break; + case PL_VAR_INVALID: + case PL_VAR_TYPE_COUNT: + pl_unreachable(); + } + +#undef CHECK_BOUNDS + return true; +} + +static bool parse_param(pl_log log, void *alloc, pl_str *body, + struct pl_hook_par *out) +{ + *out = (struct pl_hook_par) {0}; + pl_str minimum = {0}; + pl_str maximum = {0}; + bool is_enum = false; + + while (true) { + pl_str rest; + pl_str line = pl_str_strip(pl_str_getline(*body, &rest)); + + if (!pl_str_eatstart0(&line, "//!")) + break; + + *body = rest; + + if (pl_str_eatstart0(&line, "PARAM")) { + out->name = pl_strdup0(alloc, pl_str_strip(line)); + continue; + } + + if (pl_str_eatstart0(&line, "DESC")) { + out->description = pl_strdup0(alloc, pl_str_strip(line)); + continue; + } + + if (pl_str_eatstart0(&line, "MINIMUM")) { + minimum = pl_str_strip(line); + continue; + } + + if (pl_str_eatstart0(&line, "MAXIMUM")) { + maximum = pl_str_strip(line); + continue; + } + + if (pl_str_eatstart0(&line, "TYPE")) { + line = pl_str_strip(line); + is_enum = pl_str_eatstart0(&line, "ENUM"); + line = pl_str_strip(line); + if (pl_str_eatstart0(&line, "DYNAMIC")) { + out->mode = PL_HOOK_PAR_DYNAMIC; + } else if (pl_str_eatstart0(&line, "CONSTANT")) { + out->mode = PL_HOOK_PAR_CONSTANT; + } else if (pl_str_eatstart0(&line, "DEFINE")) { + out->mode = PL_HOOK_PAR_DEFINE; + out->type = PL_VAR_SINT; + if (pl_str_strip(line).len > 0) { + pl_err(log, "TYPE DEFINE does not take any extra arguments, " + "unexpected: '%.*s'", PL_STR_FMT(line)); + return false; + } + continue; + } else { + out->mode = PL_HOOK_PAR_VARIABLE; + } + + line = pl_str_strip(line); + for (const struct pl_named_var *nv = pl_var_glsl_types; + nv->glsl_name; nv++) + { + if (pl_str_equals0(line, nv->glsl_name)) { + if (nv->var.dim_v > 1 || nv->var.dim_m > 1) { + pl_err(log, "GLSL type '%s' is incompatible with " + "shader parameters, must be scalar type!", + nv->glsl_name); + return false; + } + + out->type = nv->var.type; + if (is_enum && out->type != PL_VAR_SINT) { 
+ pl_err(log, "ENUM is only compatible with type int/DEFINE!"); + return false; + } + goto next; + } + } + + pl_err(log, "Unrecognized GLSL type '%.*s'!", PL_STR_FMT(line)); + return false; + } + + pl_err(log, "Unrecognized command '%.*s'!", PL_STR_FMT(line)); + return false; + +next: ; + } + + switch (out->type) { + case PL_VAR_INVALID: + pl_err(log, "Missing variable type!"); + return false; + case PL_VAR_SINT: + out->minimum.i = INT_MIN; + out->maximum.i = INT_MAX; + break; + case PL_VAR_UINT: + out->minimum.u = 0; + out->maximum.u = UINT_MAX; + break; + case PL_VAR_FLOAT: + out->minimum.f = -INFINITY; + out->maximum.f = INFINITY; + break; + case PL_VAR_TYPE_COUNT: + pl_unreachable(); + } + + pl_str initial = pl_str_strip(split_magic(body)); + if (!initial.len) { + pl_err(log, "Missing initial parameter value!"); + return false; + } + + if (is_enum) { + PL_ARRAY(const char *) names = {0}; + pl_assert(out->type == PL_VAR_SINT); + do { + pl_str line = pl_str_strip(pl_str_getline(initial, &initial)); + if (!line.len) + continue; + PL_ARRAY_APPEND(alloc, names, pl_strdup0(alloc, line)); + } while (initial.len); + + pl_assert(names.num >= 1); + out->initial.i = 0; + out->minimum.i = 0; + out->maximum.i = names.num - 1; + out->names = names.elem; + } else { + if (!parse_var(log, initial, out->type, &out->initial)) + return false; + if (!parse_var(log, minimum, out->type, &out->minimum)) + return false; + if (!parse_var(log, maximum, out->type, &out->maximum)) + return false; + if (!check_bounds(log, out->type, out->initial, out->minimum, out->maximum)) + return false; + } + + out->data = pl_memdup(alloc, &out->initial, sizeof(out->initial)); + return true; +} + +static enum pl_hook_stage mp_stage_to_pl(pl_str stage) +{ + if (pl_str_equals0(stage, "RGB")) + return PL_HOOK_RGB_INPUT; + if (pl_str_equals0(stage, "LUMA")) + return PL_HOOK_LUMA_INPUT; + if (pl_str_equals0(stage, "CHROMA")) + return PL_HOOK_CHROMA_INPUT; + if (pl_str_equals0(stage, "ALPHA")) + return PL_HOOK_ALPHA_INPUT; + if (pl_str_equals0(stage, "XYZ")) + return PL_HOOK_XYZ_INPUT; + + if (pl_str_equals0(stage, "CHROMA_SCALED")) + return PL_HOOK_CHROMA_SCALED; + if (pl_str_equals0(stage, "ALPHA_SCALED")) + return PL_HOOK_ALPHA_SCALED; + + if (pl_str_equals0(stage, "NATIVE")) + return PL_HOOK_NATIVE; + if (pl_str_equals0(stage, "MAINPRESUB")) + return PL_HOOK_RGB; + if (pl_str_equals0(stage, "MAIN")) + return PL_HOOK_RGB; // Note: conflicts with above! 
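+ // (Aside, for illustration: these stage names are the strings user shaders
+ // write after `//!HOOK`, so a pass headed by `//!HOOK MAIN` or by
+ // `//!HOOK MAINPRESUB` ends up attached to PL_HOOK_RGB either way, since
+ // libplacebo treats the two as a single stage.)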
+ + if (pl_str_equals0(stage, "LINEAR")) + return PL_HOOK_LINEAR; + if (pl_str_equals0(stage, "SIGMOID")) + return PL_HOOK_SIGMOID; + if (pl_str_equals0(stage, "PREKERNEL")) + return PL_HOOK_PRE_KERNEL; + if (pl_str_equals0(stage, "POSTKERNEL")) + return PL_HOOK_POST_KERNEL; + + if (pl_str_equals0(stage, "SCALED")) + return PL_HOOK_SCALED; + if (pl_str_equals0(stage, "PREOUTPUT")) + return PL_HOOK_PRE_OUTPUT; + if (pl_str_equals0(stage, "OUTPUT")) + return PL_HOOK_OUTPUT; + + return 0; +} + +static pl_str pl_stage_to_mp(enum pl_hook_stage stage) +{ + switch (stage) { + case PL_HOOK_RGB_INPUT: return pl_str0("RGB"); + case PL_HOOK_LUMA_INPUT: return pl_str0("LUMA"); + case PL_HOOK_CHROMA_INPUT: return pl_str0("CHROMA"); + case PL_HOOK_ALPHA_INPUT: return pl_str0("ALPHA"); + case PL_HOOK_XYZ_INPUT: return pl_str0("XYZ"); + + case PL_HOOK_CHROMA_SCALED: return pl_str0("CHROMA_SCALED"); + case PL_HOOK_ALPHA_SCALED: return pl_str0("ALPHA_SCALED"); + + case PL_HOOK_NATIVE: return pl_str0("NATIVE"); + case PL_HOOK_RGB: return pl_str0("MAINPRESUB"); + + case PL_HOOK_LINEAR: return pl_str0("LINEAR"); + case PL_HOOK_SIGMOID: return pl_str0("SIGMOID"); + case PL_HOOK_PRE_KERNEL: return pl_str0("PREKERNEL"); + case PL_HOOK_POST_KERNEL: return pl_str0("POSTKERNEL"); + + case PL_HOOK_SCALED: return pl_str0("SCALED"); + case PL_HOOK_PRE_OUTPUT: return pl_str0("PREOUTPUT"); + case PL_HOOK_OUTPUT: return pl_str0("OUTPUT"); + }; + + pl_unreachable(); +} + +struct hook_pass { + enum pl_hook_stage exec_stages; + struct custom_shader_hook hook; +}; + +struct pass_tex { + pl_str name; + pl_tex tex; + + // Metadata + pl_rect2df rect; + struct pl_color_repr repr; + struct pl_color_space color; + int comps; +}; + +struct hook_priv { + pl_log log; + pl_gpu gpu; + void *alloc; + + PL_ARRAY(struct hook_pass) hook_passes; + PL_ARRAY(struct pl_hook_par) hook_params; + + // Fixed (for shader-local resources) + PL_ARRAY(struct pl_shader_desc) descriptors; + + // Dynamic per pass + enum pl_hook_stage save_stages; + PL_ARRAY(struct pass_tex) pass_textures; + pl_shader trc_helper; + + // State for PRNG/frame count + int frame_count; + uint64_t prng_state[4]; +}; + +static void hook_reset(void *priv) +{ + struct hook_priv *p = priv; + p->pass_textures.num = 0; +} + +// Context during execution of a hook +struct hook_ctx { + struct hook_priv *priv; + const struct pl_hook_params *params; + struct pass_tex hooked; +}; + +static bool lookup_tex(struct hook_ctx *ctx, pl_str var, float size[2]) +{ + struct hook_priv *p = ctx->priv; + const struct pl_hook_params *params = ctx->params; + + if (pl_str_equals0(var, "HOOKED")) { + pl_assert(ctx->hooked.tex); + size[0] = ctx->hooked.tex->params.w; + size[1] = ctx->hooked.tex->params.h; + return true; + } + + if (pl_str_equals0(var, "NATIVE_CROPPED")) { + size[0] = fabs(pl_rect_w(params->src_rect)); + size[1] = fabs(pl_rect_h(params->src_rect)); + return true; + } + + if (pl_str_equals0(var, "OUTPUT")) { + size[0] = abs(pl_rect_w(params->dst_rect)); + size[1] = abs(pl_rect_h(params->dst_rect)); + return true; + } + + if (pl_str_equals0(var, "MAIN")) + var = pl_str0("MAINPRESUB"); + + for (int i = 0; i < p->pass_textures.num; i++) { + if (pl_str_equals(var, p->pass_textures.elem[i].name)) { + pl_tex tex = p->pass_textures.elem[i].tex; + size[0] = tex->params.w; + size[1] = tex->params.h; + return true; + } + } + + return false; +} + +static bool lookup_var(struct hook_ctx *ctx, pl_str var, float *val) +{ + struct hook_priv *p = ctx->priv; + for (int i = 0; i < p->hook_params.num; i++) { + 
const struct pl_hook_par *hp = &p->hook_params.elem[i]; + if (pl_str_equals0(var, hp->name)) { + switch (hp->type) { + case PL_VAR_SINT: *val = hp->data->i; return true; + case PL_VAR_UINT: *val = hp->data->u; return true; + case PL_VAR_FLOAT: *val = hp->data->f; return true; + case PL_VAR_INVALID: + case PL_VAR_TYPE_COUNT: + break; + } + + pl_unreachable(); + } + + if (hp->names) { + for (int j = hp->minimum.i; j <= hp->maximum.i; j++) { + if (pl_str_equals0(var, hp->names[j])) { + *val = j; + return true; + } + } + } + } + + PL_WARN(p, "Variable '%.*s' not found in RPN expression!", PL_STR_FMT(var)); + return false; +} + +// Returns whether successful. 'result' is left untouched on failure +static bool eval_shexpr(struct hook_ctx *ctx, + const struct shexp expr[MAX_SHEXP_SIZE], + float *result) +{ + struct hook_priv *p = ctx->priv; + float stack[MAX_SHEXP_SIZE] = {0}; + int idx = 0; // points to next element to push + + for (int i = 0; i < MAX_SHEXP_SIZE; i++) { + switch (expr[i].tag) { + case SHEXP_END: + goto done; + + case SHEXP_CONST: + // Since our SHEXPs are bound by MAX_SHEXP_SIZE, it should be + // impossible to overflow the stack + assert(idx < MAX_SHEXP_SIZE); + stack[idx++] = expr[i].val.cval; + continue; + + case SHEXP_OP1: + if (idx < 1) { + PL_WARN(p, "Stack underflow in RPN expression!"); + return false; + } + + switch (expr[i].val.op) { + case SHEXP_OP_NOT: stack[idx-1] = !stack[idx-1]; break; + default: pl_unreachable(); + } + continue; + + case SHEXP_OP2: + if (idx < 2) { + PL_WARN(p, "Stack underflow in RPN expression!"); + return false; + } + + // Pop the operands in reverse order + float op2 = stack[--idx]; + float op1 = stack[--idx]; + float res = 0.0; + switch (expr[i].val.op) { + case SHEXP_OP_ADD: res = op1 + op2; break; + case SHEXP_OP_SUB: res = op1 - op2; break; + case SHEXP_OP_MUL: res = op1 * op2; break; + case SHEXP_OP_DIV: res = op1 / op2; break; + case SHEXP_OP_MOD: res = fmodf(op1, op2); break; + case SHEXP_OP_GT: res = op1 > op2; break; + case SHEXP_OP_LT: res = op1 < op2; break; + case SHEXP_OP_EQ: res = fabsf(op1 - op2) <= 1e-6 * fmaxf(op1, op2); break; + case SHEXP_OP_NOT: pl_unreachable(); + } + + if (!isfinite(res)) { + PL_WARN(p, "Illegal operation in RPN expression!"); + return false; + } + + stack[idx++] = res; + continue; + + case SHEXP_TEX_W: + case SHEXP_TEX_H: { + pl_str name = expr[i].val.varname; + float size[2]; + + if (!lookup_tex(ctx, name, size)) { + PL_WARN(p, "Variable '%.*s' not found in RPN expression!", + PL_STR_FMT(name)); + return false; + } + + stack[idx++] = (expr[i].tag == SHEXP_TEX_W) ? 
size[0] : size[1]; + continue; + } + + case SHEXP_VAR: { + pl_str name = expr[i].val.varname; + float val; + if (!lookup_var(ctx, name, &val)) + return false; + stack[idx++] = val; + continue; + } + } + } + +done: + // Return the single stack element + if (idx != 1) { + PL_WARN(p, "Malformed stack after RPN expression!"); + return false; + } + + *result = stack[0]; + return true; +} + +static double prng_step(uint64_t s[4]) +{ + const uint64_t result = s[0] + s[3]; + const uint64_t t = s[1] << 17; + + s[2] ^= s[0]; + s[3] ^= s[1]; + s[1] ^= s[2]; + s[0] ^= s[3]; + + s[2] ^= t; + s[3] = (s[3] << 45) | (s[3] >> (64 - 45)); + return (result >> 11) * 0x1.0p-53; +} + +static bool bind_pass_tex(pl_shader sh, pl_str name, + const struct pass_tex *ptex, + const pl_rect2df *rect, + bool hooked, bool mainpresub) +{ + ident_t id, pos, pt; + + // Compatibility with mpv texture binding semantics + id = sh_bind(sh, ptex->tex, PL_TEX_ADDRESS_CLAMP, PL_TEX_SAMPLE_LINEAR, + "hook_tex", rect, &pos, &pt); + if (!id) + return false; + + GLSLH("#define %.*s_raw "$" \n", PL_STR_FMT(name), id); + GLSLH("#define %.*s_pos "$" \n", PL_STR_FMT(name), pos); + GLSLH("#define %.*s_map "$"_map \n", PL_STR_FMT(name), pos); + GLSLH("#define %.*s_size vec2(textureSize("$", 0)) \n", PL_STR_FMT(name), id); + GLSLH("#define %.*s_pt "$" \n", PL_STR_FMT(name), pt); + + float off[2] = { ptex->rect.x0, ptex->rect.y0 }; + GLSLH("#define %.*s_off "$" \n", PL_STR_FMT(name), + sh_var(sh, (struct pl_shader_var) { + .var = pl_var_vec2("offset"), + .data = off, + })); + + struct pl_color_repr repr = ptex->repr; + ident_t scale = SH_FLOAT(pl_color_repr_normalize(&repr)); + GLSLH("#define %.*s_mul "$" \n", PL_STR_FMT(name), scale); + + // Compatibility with mpv + GLSLH("#define %.*s_rot mat2(1.0, 0.0, 0.0, 1.0) \n", PL_STR_FMT(name)); + + // Sampling function boilerplate + GLSLH("#define %.*s_tex(pos) ("$" * vec4(textureLod("$", pos, 0.0))) \n", + PL_STR_FMT(name), scale, id); + GLSLH("#define %.*s_texOff(off) (%.*s_tex("$" + "$" * vec2(off))) \n", + PL_STR_FMT(name), PL_STR_FMT(name), pos, pt); + + bool can_gather = ptex->tex->params.format->gatherable; + if (can_gather) { + GLSLH("#define %.*s_gather(pos, c) ("$" * vec4(textureGather("$", pos, c))) \n", + PL_STR_FMT(name), scale, id); + } + + if (hooked) { + GLSLH("#define HOOKED_raw %.*s_raw \n", PL_STR_FMT(name)); + GLSLH("#define HOOKED_pos %.*s_pos \n", PL_STR_FMT(name)); + GLSLH("#define HOOKED_size %.*s_size \n", PL_STR_FMT(name)); + GLSLH("#define HOOKED_rot %.*s_rot \n", PL_STR_FMT(name)); + GLSLH("#define HOOKED_off %.*s_off \n", PL_STR_FMT(name)); + GLSLH("#define HOOKED_pt %.*s_pt \n", PL_STR_FMT(name)); + GLSLH("#define HOOKED_map %.*s_map \n", PL_STR_FMT(name)); + GLSLH("#define HOOKED_mul %.*s_mul \n", PL_STR_FMT(name)); + GLSLH("#define HOOKED_tex %.*s_tex \n", PL_STR_FMT(name)); + GLSLH("#define HOOKED_texOff %.*s_texOff \n", PL_STR_FMT(name)); + if (can_gather) + GLSLH("#define HOOKED_gather %.*s_gather \n", PL_STR_FMT(name)); + } + + if (mainpresub) { + GLSLH("#define MAIN_raw MAINPRESUB_raw \n"); + GLSLH("#define MAIN_pos MAINPRESUB_pos \n"); + GLSLH("#define MAIN_size MAINPRESUB_size \n"); + GLSLH("#define MAIN_rot MAINPRESUB_rot \n"); + GLSLH("#define MAIN_off MAINPRESUB_off \n"); + GLSLH("#define MAIN_pt MAINPRESUB_pt \n"); + GLSLH("#define MAIN_map MAINPRESUB_map \n"); + GLSLH("#define MAIN_mul MAINPRESUB_mul \n"); + GLSLH("#define MAIN_tex MAINPRESUB_tex \n"); + GLSLH("#define MAIN_texOff MAINPRESUB_texOff \n"); + if (can_gather) + GLSLH("#define MAIN_gather 
MAINPRESUB_gather \n"); + } + + return true; +} + +static void save_pass_tex(struct hook_priv *p, struct pass_tex ptex) +{ + + for (int i = 0; i < p->pass_textures.num; i++) { + if (!pl_str_equals(p->pass_textures.elem[i].name, ptex.name)) + continue; + + p->pass_textures.elem[i] = ptex; + return; + } + + // No texture with this name yet, append new one + PL_ARRAY_APPEND(p->alloc, p->pass_textures, ptex); +} + +static struct pl_hook_res hook_hook(void *priv, const struct pl_hook_params *params) +{ + struct hook_priv *p = priv; + pl_str stage = pl_stage_to_mp(params->stage); + struct pl_hook_res res = {0}; + + pl_shader sh = NULL; + struct hook_ctx ctx = { + .priv = p, + .params = params, + .hooked = { + .name = stage, + .tex = params->tex, + .rect = params->rect, + .repr = params->repr, + .color = params->color, + .comps = params->components, + }, + }; + + // Save the input texture if needed + if (p->save_stages & params->stage) { + PL_TRACE(p, "Saving input texture '%.*s' for binding", + PL_STR_FMT(ctx.hooked.name)); + save_pass_tex(p, ctx.hooked); + } + + for (int n = 0; n < p->hook_passes.num; n++) { + const struct hook_pass *pass = &p->hook_passes.elem[n]; + if (!(pass->exec_stages & params->stage)) + continue; + + const struct custom_shader_hook *hook = &pass->hook; + PL_TRACE(p, "Executing hook pass %d on stage '%.*s': %.*s", + n, PL_STR_FMT(stage), PL_STR_FMT(hook->pass_desc)); + + // Test for execution condition + float run = 0; + if (!eval_shexpr(&ctx, hook->cond, &run)) + goto error; + + if (!run) { + PL_TRACE(p, "Skipping hook due to condition"); + continue; + } + + // Generate a new shader object + sh = pl_dispatch_begin(params->dispatch); + + // Bind all necessary input textures + for (int i = 0; i < PL_ARRAY_SIZE(hook->bind_tex); i++) { + pl_str texname = hook->bind_tex[i]; + if (!texname.len) + break; + + // Convenience alias, to allow writing shaders that are oblivious + // of the exact stage they hooked. This simply translates to + // whatever stage actually fired the hook. + bool hooked = false, mainpresub = false; + if (pl_str_equals0(texname, "HOOKED")) { + // Continue with binding this, under the new name + texname = stage; + hooked = true; + } + + // Compatibility alias, because MAIN and MAINPRESUB mean the same + // thing to libplacebo, but user shaders are still written as + // though they can be different concepts. + if (pl_str_equals0(texname, "MAIN") || + pl_str_equals0(texname, "MAINPRESUB")) + { + texname = pl_str0("MAINPRESUB"); + mainpresub = true; + } + + for (int j = 0; j < p->descriptors.num; j++) { + if (pl_str_equals0(texname, p->descriptors.elem[j].desc.name)) { + // Directly bind this, no need to bother with all the + // `bind_pass_tex` boilerplate + ident_t id = sh_desc(sh, p->descriptors.elem[j]); + GLSLH("#define %.*s "$" \n", PL_STR_FMT(texname), id); + + if (p->descriptors.elem[j].desc.type == PL_DESC_SAMPLED_TEX) { + GLSLH("#define %.*s_tex(pos) (textureLod("$", pos, 0.0)) \n", + PL_STR_FMT(texname), id); + } + goto next_bind; + } + } + + for (int j = 0; j < p->pass_textures.num; j++) { + if (pl_str_equals(texname, p->pass_textures.elem[j].name)) { + // Note: We bind the whole texture, rather than + // hooked.rect, because user shaders in general are not + // designed to handle cropped input textures. 
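+ // (Sketch of the intent, assuming mpv's documented OFFSET ALIGN
+ // semantics: the rect below spans the whole texture, and for
+ // `//!OFFSET ALIGN` passes the scale/offset math that follows shifts
+ // this plane so it lines up with the reference `src_rect`, e.g. to
+ // realign a chroma plane against luma before the hook runs.)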
+ const struct pass_tex *ptex = &p->pass_textures.elem[j]; + pl_rect2df rect = { + 0, 0, ptex->tex->params.w, ptex->tex->params.h, + }; + + if (hook->offset_align && pl_str_equals(texname, stage)) { + float sx = pl_rect_w(ctx.hooked.rect) / pl_rect_w(params->src_rect), + sy = pl_rect_h(ctx.hooked.rect) / pl_rect_h(params->src_rect), + ox = ctx.hooked.rect.x0 - sx * params->src_rect.x0, + oy = ctx.hooked.rect.y0 - sy * params->src_rect.y0; + + PL_TRACE(p, "Aligning plane with ref: %f %f", ox, oy); + pl_rect2df_offset(&rect, ox, oy); + } + + if (!bind_pass_tex(sh, texname, &p->pass_textures.elem[j], + &rect, hooked, mainpresub)) + { + goto error; + } + goto next_bind; + } + } + + // If none of the above matched, this is an unknown texture name, + // so silently ignore this pass to match the mpv behavior + PL_TRACE(p, "Skipping hook due to no texture named '%.*s'.", + PL_STR_FMT(texname)); + pl_dispatch_abort(params->dispatch, &sh); + goto next_pass; + + next_bind: ; // outer 'continue' + } + + // Set up the input variables + p->frame_count++; + GLSLH("#define frame "$" \n", sh_var(sh, (struct pl_shader_var) { + .var = pl_var_int("frame"), + .data = &p->frame_count, + .dynamic = true, + })); + + float random = prng_step(p->prng_state); + GLSLH("#define random "$" \n", sh_var(sh, (struct pl_shader_var) { + .var = pl_var_float("random"), + .data = &random, + .dynamic = true, + })); + + float src_size[2] = { pl_rect_w(params->src_rect), pl_rect_h(params->src_rect) }; + GLSLH("#define input_size "$" \n", sh_var(sh, (struct pl_shader_var) { + .var = pl_var_vec2("input_size"), + .data = src_size, + })); + + float dst_size[2] = { pl_rect_w(params->dst_rect), pl_rect_h(params->dst_rect) }; + GLSLH("#define target_size "$" \n", sh_var(sh, (struct pl_shader_var) { + .var = pl_var_vec2("target_size"), + .data = dst_size, + })); + + float tex_off[2] = { params->src_rect.x0, params->src_rect.y0 }; + GLSLH("#define tex_offset "$" \n", sh_var(sh, (struct pl_shader_var) { + .var = pl_var_vec2("tex_offset"), + .data = tex_off, + })); + + // Custom parameters + for (int i = 0; i < p->hook_params.num; i++) { + const struct pl_hook_par *hp = &p->hook_params.elem[i]; + switch (hp->mode) { + case PL_HOOK_PAR_VARIABLE: + case PL_HOOK_PAR_DYNAMIC: + GLSLH("#define %s "$" \n", hp->name, + sh_var(sh, (struct pl_shader_var) { + .var = { + .name = hp->name, + .type = hp->type, + .dim_v = 1, + .dim_m = 1, + .dim_a = 1, + }, + .data = hp->data, + .dynamic = hp->mode == PL_HOOK_PAR_DYNAMIC, + })); + break; + + case PL_HOOK_PAR_CONSTANT: + GLSLH("#define %s "$" \n", hp->name, + sh_const(sh, (struct pl_shader_const) { + .name = hp->name, + .type = hp->type, + .data = hp->data, + .compile_time = true, + })); + break; + + case PL_HOOK_PAR_DEFINE: + GLSLH("#define %s %d \n", hp->name, hp->data->i); + break; + + case PL_HOOK_PAR_MODE_COUNT: + pl_unreachable(); + } + + if (hp->names) { + for (int j = hp->minimum.i; j <= hp->maximum.i; j++) + GLSLH("#define %s %d \n", hp->names[j], j); + } + } + + // Helper sub-shaders + uint64_t sh_id = SH_PARAMS(sh).id; + pl_shader_reset(p->trc_helper, pl_shader_params( + .id = ++sh_id, + .gpu = p->gpu, + )); + pl_shader_linearize(p->trc_helper, params->orig_color); + GLSLH("#define linearize "$" \n", sh_subpass(sh, p->trc_helper)); + + pl_shader_reset(p->trc_helper, pl_shader_params( + .id = ++sh_id, + .gpu = p->gpu, + )); + pl_shader_delinearize(p->trc_helper, params->orig_color); + GLSLH("#define delinearize "$" \n", sh_subpass(sh, p->trc_helper)); + + // Load and run the user shader itself + 
sh_append_str(sh, SH_BUF_HEADER, hook->pass_body); + sh_describef(sh, "%.*s", PL_STR_FMT(hook->pass_desc)); + + // Resolve output size and create framebuffer + float out_size[2] = {0}; + if (!eval_shexpr(&ctx, hook->width, &out_size[0]) || + !eval_shexpr(&ctx, hook->height, &out_size[1])) + { + goto error; + } + + int out_w = roundf(out_size[0]), + out_h = roundf(out_size[1]); + + if (!sh_require(sh, PL_SHADER_SIG_NONE, out_w, out_h)) + goto error; + + // Generate a new texture to store the render result + pl_tex fbo; + fbo = params->get_tex(params->priv, out_w, out_h); + if (!fbo) { + PL_ERR(p, "Failed dispatching hook: `get_tex` callback failed?"); + goto error; + } + + bool ok; + if (hook->is_compute) { + + if (!sh_try_compute(sh, hook->threads_w, hook->threads_h, false, 0) || + !fbo->params.storable) + { + PL_ERR(p, "Failed dispatching COMPUTE shader"); + goto error; + } + + GLSLP("#define out_image "$" \n", sh_desc(sh, (struct pl_shader_desc) { + .binding.object = fbo, + .desc = { + .name = "out_image", + .type = PL_DESC_STORAGE_IMG, + .access = PL_DESC_ACCESS_WRITEONLY, + }, + })); + + sh->output = PL_SHADER_SIG_NONE; + + GLSL("hook(); \n"); + ok = pl_dispatch_compute(params->dispatch, pl_dispatch_compute_params( + .shader = &sh, + .dispatch_size = { + // Round up as many blocks as are needed to cover the image + PL_DIV_UP(out_w, hook->block_w), + PL_DIV_UP(out_h, hook->block_h), + 1, + }, + .width = out_w, + .height = out_h, + )); + + } else { + + // Default non-COMPUTE shaders to explicitly use fragment shaders + // only, to avoid breaking things like fwidth() + sh->type = PL_DEF(sh->type, SH_FRAGMENT); + + GLSL("vec4 color = hook(); \n"); + ok = pl_dispatch_finish(params->dispatch, pl_dispatch_params( + .shader = &sh, + .target = fbo, + )); + + } + + if (!ok) + goto error; + + float sx = (float) out_w / ctx.hooked.tex->params.w, + sy = (float) out_h / ctx.hooked.tex->params.h, + x0 = sx * ctx.hooked.rect.x0 + hook->offset[0], + y0 = sy * ctx.hooked.rect.y0 + hook->offset[1]; + + pl_rect2df new_rect = { + x0, + y0, + x0 + sx * pl_rect_w(ctx.hooked.rect), + y0 + sy * pl_rect_h(ctx.hooked.rect), + }; + + if (hook->offset_align) { + float rx = pl_rect_w(new_rect) / pl_rect_w(params->src_rect), + ry = pl_rect_h(new_rect) / pl_rect_h(params->src_rect), + ox = rx * params->src_rect.x0 - sx * ctx.hooked.rect.x0, + oy = ry * params->src_rect.y0 - sy * ctx.hooked.rect.y0; + + pl_rect2df_offset(&new_rect, ox, oy); + } + + // Save the result of this shader invocation + struct pass_tex ptex = { + .name = hook->save_tex.len ? 
hook->save_tex : stage, + .tex = fbo, + .repr = ctx.hooked.repr, + .color = ctx.hooked.color, + .comps = PL_DEF(hook->comps, ctx.hooked.comps), + .rect = new_rect, + }; + + // It's assumed that users will correctly normalize the input + pl_color_repr_normalize(&ptex.repr); + + PL_TRACE(p, "Saving output texture '%.*s' from hook execution on '%.*s'", + PL_STR_FMT(ptex.name), PL_STR_FMT(stage)); + + save_pass_tex(p, ptex); + + // Update the result object, unless we saved to a different name + if (pl_str_equals(ptex.name, stage)) { + ctx.hooked = ptex; + res = (struct pl_hook_res) { + .output = PL_HOOK_SIG_TEX, + .tex = fbo, + .repr = ptex.repr, + .color = ptex.color, + .components = ptex.comps, + .rect = new_rect, + }; + } + +next_pass: ; + } + + return res; + +error: + pl_dispatch_abort(params->dispatch, &sh); + return (struct pl_hook_res) { .failed = true }; +} + +const struct pl_hook *pl_mpv_user_shader_parse(pl_gpu gpu, + const char *shader_text, + size_t shader_len) +{ + if (!shader_len) + return NULL; + + pl_str shader = { (uint8_t *) shader_text, shader_len }; + + struct pl_hook *hook = pl_zalloc_obj(NULL, hook, struct hook_priv); + struct hook_priv *p = PL_PRIV(hook); + + *hook = (struct pl_hook) { + .input = PL_HOOK_SIG_TEX, + .priv = p, + .reset = hook_reset, + .hook = hook_hook, + .signature = pl_str_hash(shader), + }; + + *p = (struct hook_priv) { + .log = gpu->log, + .gpu = gpu, + .alloc = hook, + .trc_helper = pl_shader_alloc(gpu->log, NULL), + .prng_state = { + // Determined by fair die roll + 0xb76d71f9443c228allu, 0x93a02092fc4807e8llu, + 0x06d81748f838bd07llu, 0x9381ee129dddce6cllu, + }, + }; + + shader = pl_strdup(hook, shader); + + // Skip all garbage (e.g. comments) before the first header + int pos = pl_str_find(shader, pl_str0("//!")); + if (pos < 0) { + PL_ERR(gpu, "Shader appears to contain no headers?"); + goto error; + } + shader = pl_str_drop(shader, pos); + + // Loop over the file + while (shader.len > 0) + { + // Peek at the first header to dispatch the right type + if (pl_str_startswith0(shader, "//!TEXTURE")) { + struct pl_shader_desc sd; + if (!parse_tex(gpu, hook, &shader, &sd)) + goto error; + + PL_INFO(gpu, "Registering named texture '%s'", sd.desc.name); + PL_ARRAY_APPEND(hook, p->descriptors, sd); + continue; + } + + if (pl_str_startswith0(shader, "//!BUFFER")) { + struct pl_shader_desc sd; + if (!parse_buf(gpu, hook, &shader, &sd)) + goto error; + + PL_INFO(gpu, "Registering named buffer '%s'", sd.desc.name); + PL_ARRAY_APPEND(hook, p->descriptors, sd); + continue; + } + + if (pl_str_startswith0(shader, "//!PARAM")) { + struct pl_hook_par hp; + if (!parse_param(gpu->log, hook, &shader, &hp)) + goto error; + + PL_INFO(gpu, "Registering named parameter '%s'", hp.name); + PL_ARRAY_APPEND(hook, p->hook_params, hp); + continue; + } + + struct custom_shader_hook h; + if (!parse_hook(gpu->log, &shader, &h)) + goto error; + + struct hook_pass pass = { + .exec_stages = 0, + .hook = h, + }; + + for (int i = 0; i < PL_ARRAY_SIZE(h.hook_tex); i++) + pass.exec_stages |= mp_stage_to_pl(h.hook_tex[i]); + for (int i = 0; i < PL_ARRAY_SIZE(h.bind_tex); i++) { + p->save_stages |= mp_stage_to_pl(h.bind_tex[i]); + if (pl_str_equals0(h.bind_tex[i], "HOOKED")) + p->save_stages |= pass.exec_stages; + } + + // As an extra precaution, this avoids errors when trying to run + // conditions against planes that were never hooked. As a sole + // exception, OUTPUT is special because it's hard-coded to return the + // dst_rect even before it was hooked. 
(This is an apparently + // undocumented mpv quirk, but shaders rely on it in practice) + enum pl_hook_stage rpn_stages = 0; + for (int i = 0; i < PL_ARRAY_SIZE(h.width); i++) { + if (h.width[i].tag == SHEXP_TEX_W || h.width[i].tag == SHEXP_TEX_H) + rpn_stages |= mp_stage_to_pl(h.width[i].val.varname); + } + for (int i = 0; i < PL_ARRAY_SIZE(h.height); i++) { + if (h.height[i].tag == SHEXP_TEX_W || h.height[i].tag == SHEXP_TEX_H) + rpn_stages |= mp_stage_to_pl(h.height[i].val.varname); + } + for (int i = 0; i < PL_ARRAY_SIZE(h.cond); i++) { + if (h.cond[i].tag == SHEXP_TEX_W || h.cond[i].tag == SHEXP_TEX_H) + rpn_stages |= mp_stage_to_pl(h.cond[i].val.varname); + } + + p->save_stages |= rpn_stages & ~PL_HOOK_OUTPUT; + + PL_INFO(gpu, "Registering hook pass: %.*s", PL_STR_FMT(h.pass_desc)); + PL_ARRAY_APPEND(hook, p->hook_passes, pass); + } + + // We need to hook on both the exec and save stages, so that we can keep + // track of any textures we might need + hook->stages |= p->save_stages; + for (int i = 0; i < p->hook_passes.num; i++) + hook->stages |= p->hook_passes.elem[i].exec_stages; + + hook->parameters = p->hook_params.elem; + hook->num_parameters = p->hook_params.num; + + PL_MSG(gpu, PL_LOG_DEBUG, "Loaded user shader:"); + pl_msg_source(gpu->log, PL_LOG_DEBUG, shader_text); + + return hook; + +error: + pl_mpv_user_shader_destroy((const struct pl_hook **) &hook); + PL_MSG(gpu, PL_LOG_ERR, "Failed to parse user shader:"); + pl_msg_source(gpu->log, PL_LOG_ERR, shader_text); + pl_log_stack_trace(gpu->log, PL_LOG_ERR); + return NULL; +} + +void pl_mpv_user_shader_destroy(const struct pl_hook **hookp) +{ + const struct pl_hook *hook = *hookp; + if (!hook) + return; + + struct hook_priv *p = PL_PRIV(hook); + for (int i = 0; i < p->descriptors.num; i++) { + switch (p->descriptors.elem[i].desc.type) { + case PL_DESC_BUF_UNIFORM: + case PL_DESC_BUF_STORAGE: + case PL_DESC_BUF_TEXEL_UNIFORM: + case PL_DESC_BUF_TEXEL_STORAGE: { + pl_buf buf = p->descriptors.elem[i].binding.object; + pl_buf_destroy(p->gpu, &buf); + break; + } + + case PL_DESC_SAMPLED_TEX: + case PL_DESC_STORAGE_IMG: { + pl_tex tex = p->descriptors.elem[i].binding.object; + pl_tex_destroy(p->gpu, &tex); + break; + + case PL_DESC_INVALID: + case PL_DESC_TYPE_COUNT: + pl_unreachable(); + } + } + } + + pl_shader_free(&p->trc_helper); + pl_free((void *) hook); + *hookp = NULL; +} diff --git a/src/shaders/deinterlacing.c b/src/shaders/deinterlacing.c new file mode 100644 index 0000000..5c85138 --- /dev/null +++ b/src/shaders/deinterlacing.c @@ -0,0 +1,260 @@ +/* + * This file is part of libplacebo, but also based on vf_yadif_cuda.cu: + * Copyright (C) 2018 Philip Langdale <philipl@overt.org> + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "shaders.h" + +#include <libplacebo/shaders/deinterlacing.h> + +const struct pl_deinterlace_params pl_deinterlace_default_params = { PL_DEINTERLACE_DEFAULTS }; + +void pl_shader_deinterlace(pl_shader sh, const struct pl_deinterlace_source *src, + const struct pl_deinterlace_params *params) +{ + params = PL_DEF(params, &pl_deinterlace_default_params); + + const struct pl_tex_params *texparams = &src->cur.top->params; + if (!sh_require(sh, PL_SHADER_SIG_NONE, texparams->w, texparams->h)) + return; + + sh_describe(sh, "deinterlacing"); + GLSL("vec4 color = vec4(0,0,0,1); \n" + "// pl_shader_deinterlace \n" + "{ \n"); + + uint8_t comp_mask = PL_DEF(src->component_mask, 0xFu); + comp_mask &= (1u << texparams->format->num_components) - 1u; + if (!comp_mask) { + SH_FAIL(sh, "pl_shader_deinterlace: empty component mask?"); + return; + } + + const uint8_t num_comps = sh_num_comps(comp_mask); + const char *swiz = sh_swizzle(comp_mask); + GLSL("#define T %s \n", sh_float_type(comp_mask)); + + ident_t pos, pt; + ident_t cur = sh_bind(sh, src->cur.top, PL_TEX_ADDRESS_MIRROR, + PL_TEX_SAMPLE_NEAREST, "cur", NULL, &pos, &pt); + if (!cur) + return; + + GLSL("#define GET(TEX, X, Y) \\\n" + " (textureLod(TEX, pos + pt * vec2(X, Y), 0.0).%s) \n" + "vec2 pos = "$"; \n" + "vec2 pt = "$"; \n" + "T res; \n", + swiz, pos, pt); + + if (src->field == PL_FIELD_NONE) { + GLSL("res = GET("$", 0, 0); \n", cur); + goto done; + } + + // Don't modify the primary field + GLSL("int yh = textureSize("$", 0).y; \n" + "int yo = int("$".y * float(yh)); \n" + "if (yo %% 2 == %d) { \n" + " res = GET("$", 0, 0); \n" + "} else { \n", + cur, pos, + src->field == PL_FIELD_TOP ? 0 : 1, + cur); + + switch (params->algo) { + case PL_DEINTERLACE_WEAVE: + GLSL("res = GET("$", 0, 0); \n", cur); + break; + + case PL_DEINTERLACE_BOB: + GLSL("res = GET("$", 0, %d); \n", cur, + src->field == PL_FIELD_TOP ? -1 : 1); + break; + + + case PL_DEINTERLACE_YADIF: { + // Try using a compute shader for this, for the sole reason of + // optimizing for thread group synchronicity. Otherwise, because we + // alternate between lines output as-is and lines output deinterlaced, + // half of our thread group will be mostly idle at any point in time. + const int bw = PL_DEF(sh_glsl(sh).subgroup_size, 32); + sh_try_compute(sh, bw, 1, true, 0); + + // This magic constant is hard-coded in the original implementation as + // '1' on an 8-bit scale. Since we work with arbitrary bit depth + // floating point textures, we have to convert this somehow. Hard-code + // it as 1/255 under the assumption that the original intent was to be + // roughly 1 unit of brightness increment on an 8-bit source. This may + // or may not produce suboptimal results on higher-bit-depth content. 
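+ // (For reference: 1/255 is roughly 0.0039 on the normalized [0,1] scale
+ // used by these float textures, i.e. a single 8-bit step.)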
+ static const float spatial_bias = 1 / 255.0f; + + // Calculate spatial prediction + ident_t spatial_pred = sh_fresh(sh, "spatial_predictor"); + GLSLH("float "$"(float a, float b, float c, float d, float e, float f, float g, \n" + " float h, float i, float j, float k, float l, float m, float n) \n" + "{ \n" + " float spatial_pred = (d + k) / 2.0; \n" + " float spatial_score = abs(c - j) + abs(d - k) + abs(e - l) - %f; \n" + + " float score = abs(b - k) + abs(c - l) + abs(d - m); \n" + " if (score < spatial_score) { \n" + " spatial_pred = (c + l) / 2.0; \n" + " spatial_score = score; \n" + " score = abs(a - l) + abs(b - m) + abs(c - n); \n" + " if (score < spatial_score) { \n" + " spatial_pred = (b + m) / 2.0; \n" + " spatial_score = score; \n" + " } \n" + " } \n" + " score = abs(d - i) + abs(e - j) + abs(f - k); \n" + " if (score < spatial_score) { \n" + " spatial_pred = (e + j) / 2.0; \n" + " spatial_score = score; \n" + " score = abs(e - h) + abs(f - i) + abs(g - j); \n" + " if (score < spatial_score) { \n" + " spatial_pred = (f + i) / 2.0; \n" + " spatial_score = score; \n" + " } \n" + " } \n" + " return spatial_pred; \n" + "} \n", + spatial_pred, spatial_bias); + + GLSL("T a = GET("$", -3, -1); \n" + "T b = GET("$", -2, -1); \n" + "T c = GET("$", -1, -1); \n" + "T d = GET("$", 0, -1); \n" + "T e = GET("$", +1, -1); \n" + "T f = GET("$", +2, -1); \n" + "T g = GET("$", +3, -1); \n" + "T h = GET("$", -3, +1); \n" + "T i = GET("$", -2, +1); \n" + "T j = GET("$", -1, +1); \n" + "T k = GET("$", 0, +1); \n" + "T l = GET("$", +1, +1); \n" + "T m = GET("$", +2, +1); \n" + "T n = GET("$", +3, +1); \n", + cur, cur, cur, cur, cur, cur, cur, cur, cur, cur, cur, cur, cur, cur); + + if (num_comps == 1) { + GLSL("res = "$"(a, b, c, d, e, f, g, h, i, j, k, l, m, n); \n", spatial_pred); + } else { + for (uint8_t i = 0; i < num_comps; i++) { + char c = "xyzw"[i]; + GLSL("res.%c = "$"(a.%c, b.%c, c.%c, d.%c, e.%c, f.%c, g.%c, \n" + " h.%c, i.%c, j.%c, k.%c, l.%c, m.%c, n.%c); \n", + c, spatial_pred, c, c, c, c, c, c, c, c, c, c, c, c, c, c); + } + } + + // Calculate temporal prediction + ident_t temporal_pred = sh_fresh(sh, "temporal_predictor"); + GLSLH("float "$"(float A, float B, float C, float D, float E, float F, \n" + " float G, float H, float I, float J, float K, float L, \n" + " float spatial_pred) \n" + "{ \n" + " float p0 = (C + H) / 2.0; \n" + " float p1 = F; \n" + " float p2 = (D + I) / 2.0; \n" + " float p3 = G; \n" + " float p4 = (E + J) / 2.0; \n" + + " float tdiff0 = abs(D - I) / 2.0; \n" + " float tdiff1 = (abs(A - F) + abs(B - G)) / 2.0; \n" + " float tdiff2 = (abs(K - F) + abs(G - L)) / 2.0; \n" + " float diff = max(tdiff0, max(tdiff1, tdiff2)); \n", + temporal_pred); + if (!params->skip_spatial_check) { + GLSLH("float maxi = max(p2 - min(p3, p1), min(p0 - p1, p4 - p3)); \n" + "float mini = min(p2 - max(p3, p1), max(p0 - p1, p4 - p3)); \n" + "diff = max(diff, max(mini, -maxi)); \n"); + } + GLSLH(" if (spatial_pred > p2 + diff) \n" + " spatial_pred = p2 + diff; \n" + " if (spatial_pred < p2 - diff) \n" + " spatial_pred = p2 - diff; \n" + " return spatial_pred; \n" + "} \n"); + + ident_t prev2 = cur, next2 = cur; + if (src->prev.top && src->prev.top != src->cur.top) { + pl_assert(src->prev.top->params.w == texparams->w); + pl_assert(src->prev.top->params.h == texparams->h); + prev2 = sh_bind(sh, src->prev.top, PL_TEX_ADDRESS_MIRROR, + PL_TEX_SAMPLE_NEAREST, "prev", NULL, NULL, NULL); + if (!prev2) + return; + } + + if (src->next.top && src->next.top != src->cur.top) { + 
pl_assert(src->next.top->params.w == texparams->w); + pl_assert(src->next.top->params.h == texparams->h); + next2 = sh_bind(sh, src->next.top, PL_TEX_ADDRESS_MIRROR, + PL_TEX_SAMPLE_NEAREST, "next", NULL, NULL, NULL); + if (!next2) + return; + } + + enum pl_field first_field = PL_DEF(src->first_field, PL_FIELD_TOP); + ident_t prev1 = src->field == first_field ? prev2 : cur; + ident_t next1 = src->field == first_field ? cur : next2; + + GLSL("T A = GET("$", 0, -1); \n" + "T B = GET("$", 0, 1); \n" + "T C = GET("$", 0, -2); \n" + "T D = GET("$", 0, 0); \n" + "T E = GET("$", 0, +2); \n" + "T F = GET("$", 0, -1); \n" + "T G = GET("$", 0, +1); \n" + "T H = GET("$", 0, -2); \n" + "T I = GET("$", 0, 0); \n" + "T J = GET("$", 0, +2); \n" + "T K = GET("$", 0, -1); \n" + "T L = GET("$", 0, +1); \n", + prev2, prev2, + prev1, prev1, prev1, + cur, cur, + next1, next1, next1, + next2, next2); + + if (num_comps == 1) { + GLSL("res = "$"(A, B, C, D, E, F, G, H, I, J, K, L, res); \n", temporal_pred); + } else { + for (uint8_t i = 0; i < num_comps; i++) { + char c = "xyzw"[i]; + GLSL("res.%c = "$"(A.%c, B.%c, C.%c, D.%c, E.%c, F.%c, \n" + " G.%c, H.%c, I.%c, J.%c, K.%c, L.%c, \n" + " res.%c); \n", + c, temporal_pred, c, c, c, c, c, c, c, c, c, c, c, c, c); + } + } + break; + } + + case PL_DEINTERLACE_ALGORITHM_COUNT: + pl_unreachable(); + } + + GLSL("}\n"); // End of primary/secondary field branch + +done: + GLSL("color.%s = res; \n" + "#undef T \n" + "#undef GET \n" + "} \n", + swiz); +} diff --git a/src/shaders/dithering.c b/src/shaders/dithering.c new file mode 100644 index 0000000..4485d11 --- /dev/null +++ b/src/shaders/dithering.c @@ -0,0 +1,527 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include <math.h> +#include "shaders.h" + +#include <libplacebo/shaders/dithering.h> + +const struct pl_dither_params pl_dither_default_params = { PL_DITHER_DEFAULTS }; + +struct sh_dither_obj { + pl_shader_obj lut; +}; + +static void sh_dither_uninit(pl_gpu gpu, void *ptr) +{ + struct sh_dither_obj *obj = ptr; + pl_shader_obj_destroy(&obj->lut); + *obj = (struct sh_dither_obj) {0}; +} + +static void fill_dither_matrix(void *data, const struct sh_lut_params *params) +{ + pl_assert(params->width > 0 && params->height > 0 && params->comps == 1); + + const struct pl_dither_params *dpar = params->priv; + switch (dpar->method) { + case PL_DITHER_ORDERED_LUT: + pl_assert(params->width == params->height); + pl_generate_bayer_matrix(data, params->width); + return; + + case PL_DITHER_BLUE_NOISE: + pl_assert(params->width == params->height); + pl_generate_blue_noise(data, params->width); + return; + + case PL_DITHER_ORDERED_FIXED: + case PL_DITHER_WHITE_NOISE: + case PL_DITHER_METHOD_COUNT: + return; + } + + pl_unreachable(); +} + +static bool dither_method_is_lut(enum pl_dither_method method) +{ + switch (method) { + case PL_DITHER_BLUE_NOISE: + case PL_DITHER_ORDERED_LUT: + return true; + case PL_DITHER_ORDERED_FIXED: + case PL_DITHER_WHITE_NOISE: + return false; + case PL_DITHER_METHOD_COUNT: + break; + } + + pl_unreachable(); +} + +static inline float approx_gamma(enum pl_color_transfer trc) +{ + switch (trc) { + case PL_COLOR_TRC_UNKNOWN: return 1.0f; + case PL_COLOR_TRC_LINEAR: return 1.0f; + case PL_COLOR_TRC_PRO_PHOTO:return 1.8f; + case PL_COLOR_TRC_GAMMA18: return 1.8f; + case PL_COLOR_TRC_GAMMA20: return 2.0f; + case PL_COLOR_TRC_GAMMA24: return 2.4f; + case PL_COLOR_TRC_GAMMA26: return 2.6f; + case PL_COLOR_TRC_ST428: return 2.6f; + case PL_COLOR_TRC_GAMMA28: return 2.8f; + + case PL_COLOR_TRC_SRGB: + case PL_COLOR_TRC_BT_1886: + case PL_COLOR_TRC_GAMMA22: + return 2.2f; + + case PL_COLOR_TRC_PQ: + case PL_COLOR_TRC_HLG: + case PL_COLOR_TRC_V_LOG: + case PL_COLOR_TRC_S_LOG1: + case PL_COLOR_TRC_S_LOG2: + return 2.0f; // TODO: handle this better + + case PL_COLOR_TRC_COUNT: break; + } + + pl_unreachable(); +} + +void pl_shader_dither(pl_shader sh, int new_depth, + pl_shader_obj *dither_state, + const struct pl_dither_params *params) +{ + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return; + + if (new_depth <= 0 || new_depth > 256) { + PL_WARN(sh, "Invalid dither depth: %d.. 
ignoring", new_depth); + return; + } + + sh_describef(sh, "dithering (%d bits)", new_depth); + GLSL("// pl_shader_dither \n" + "{ \n" + "float bias; \n"); + + params = PL_DEF(params, &pl_dither_default_params); + if (params->lut_size < 0 || params->lut_size > 8) { + SH_FAIL(sh, "Invalid `lut_size` specified: %d", params->lut_size); + return; + } + + enum pl_dither_method method = params->method; + ident_t lut = NULL_IDENT; + int lut_size = 0; + + if (dither_method_is_lut(method)) { + if (!dither_state) { + PL_WARN(sh, "LUT-based dither method specified but no dither state " + "object given, falling back to non-LUT based methods."); + goto fallback; + } + + struct sh_dither_obj *obj; + obj = SH_OBJ(sh, dither_state, PL_SHADER_OBJ_DITHER, + struct sh_dither_obj, sh_dither_uninit); + if (!obj) + goto fallback; + + bool cache = method == PL_DITHER_BLUE_NOISE; + lut_size = 1 << PL_DEF(params->lut_size, pl_dither_default_params.lut_size); + lut = sh_lut(sh, sh_lut_params( + .object = &obj->lut, + .var_type = PL_VAR_FLOAT, + .width = lut_size, + .height = lut_size, + .comps = 1, + .fill = fill_dither_matrix, + .signature = (CACHE_KEY_DITHER ^ method) * lut_size, + .cache = cache ? SH_CACHE(sh) : NULL, + .priv = (void *) params, + )); + if (!lut) + goto fallback; + } + + goto done; + +fallback: + method = PL_DITHER_ORDERED_FIXED; + // fall through + +done: ; + + int size = 0; + if (lut) { + size = lut_size; + } else if (method == PL_DITHER_ORDERED_FIXED) { + size = 16; // hard-coded size + } + + if (size) { + // Transform the screen position to the cyclic range [0,1) + GLSL("vec2 pos = fract(gl_FragCoord.xy * 1.0/"$"); \n", SH_FLOAT(size)); + + if (params->temporal) { + int phase = SH_PARAMS(sh).index % 8; + float r = phase * (M_PI / 2); // rotate + float m = phase < 4 ? 1 : -1; // mirror + float mat[2][2] = { + {cos(r), -sin(r) }, + {sin(r) * m, cos(r) * m}, + }; + + ident_t rot = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_mat2("dither_rot"), + .data = &mat[0][0], + .dynamic = true, + }); + GLSL("pos = fract("$" * pos + vec2(1.0));\n", rot); + } + } + + switch (method) { + case PL_DITHER_WHITE_NOISE: { + ident_t prng = sh_prng(sh, params->temporal, NULL); + GLSL("bias = "$".x;\n", prng); + break; + } + + case PL_DITHER_ORDERED_FIXED: + // Bitwise ordered dither using only 32-bit uints + GLSL("uvec2 xy = uvec2(pos * 16.0) %% 16u; \n" + // Bitwise merge (morton number) + "xy.x = xy.x ^ xy.y; \n" + "xy = (xy | xy << 2) & uvec2(0x33333333); \n" + "xy = (xy | xy << 1) & uvec2(0x55555555); \n" + // Bitwise inversion + "uint b = xy.x + (xy.y << 1); \n" + "b = (b * 0x0802u & 0x22110u) | \n" + " (b * 0x8020u & 0x88440u); \n" + "b = 0x10101u * b; \n" + "b = (b >> 16) & 0xFFu; \n" + // Generate bias value + "bias = float(b) * 1.0/256.0; \n"); + break; + + case PL_DITHER_BLUE_NOISE: + case PL_DITHER_ORDERED_LUT: + pl_assert(lut); + GLSL("bias = "$"(ivec2(pos * "$"));\n", lut, SH_FLOAT(lut_size)); + break; + + case PL_DITHER_METHOD_COUNT: + pl_unreachable(); + } + + // Scale factor for dither rounding + GLSL("const float scale = %llu.0; \n", (1LLU << new_depth) - 1); + + const float gamma = approx_gamma(params->transfer); + if (gamma != 1.0f && new_depth <= 4) { + GLSL("const float gamma = "$"; \n" + "vec4 color_lin = pow(color, vec4(gamma)); \n", + SH_FLOAT(gamma)); + + if (new_depth == 1) { + // Special case for bit depth 1 dithering, in this case we can just + // ignore the low/high rounding because we know we are always + // dithering between 0.0 and 1.0. 
+ GLSL("const vec4 low = vec4(0.0); \n" + "const vec4 high = vec4(1.0); \n" + "vec4 offset = color_lin; \n"); + } else { + // Linearize the low, high and current color values + GLSL("vec4 low = floor(color * scale) / scale; \n" + "vec4 high = ceil(color * scale) / scale; \n" + "vec4 low_lin = pow(low, vec4(gamma)); \n" + "vec4 high_lin = pow(high, vec4(gamma)); \n" + "vec4 range = high_lin - low_lin; \n" + "vec4 offset = (color_lin - low_lin) / \n" + " max(range, 1e-6); \n"); + } + + // Mix in the correct ratio corresponding to the offset and bias + GLSL("color = mix(low, high, greaterThan(offset, vec4(bias))); \n"); + } else { + // Approximate each gamma segment as a straight line, this simplifies + // the process of dithering down to a single scale and (biased) round. + GLSL("color = scale * color + vec4(bias); \n" + "color = floor(color) * (1.0 / scale); \n"); + } + + GLSL("} \n"); +} + +/* Error diffusion code is taken from mpv, original copyright (c) 2019 Bin Jin + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see <http://www.gnu.org/licenses/>. + */ + +// After a (y, x) -> (y, x + y * shift) mapping, find the right most column that +// will be affected by the current column. +static int compute_rightmost_shifted_column(const struct pl_error_diffusion_kernel *k) +{ + int ret = 0; + for (int y = 0; y <= PL_EDF_MAX_DY; y++) { + for (int x = PL_EDF_MIN_DX; x <= PL_EDF_MAX_DX; x++) { + if (k->pattern[y][x - PL_EDF_MIN_DX] != 0) { + int shifted_x = x + y * k->shift; + + // The shift mapping guarantees current column (or left of it) + // won't be affected by error diffusion. + assert(shifted_x > 0); + + ret = PL_MAX(ret, shifted_x); + } + } + } + return ret; +} + +size_t pl_error_diffusion_shmem_req(const struct pl_error_diffusion_kernel *kernel, + int height) +{ + // We add PL_EDF_MAX_DY empty lines on the bottom to handle errors + // propagated out from bottom side. + int rows = height + PL_EDF_MAX_DY; + int shifted_columns = compute_rightmost_shifted_column(kernel) + 1; + + // The shared memory is an array of size rows*shifted_columns. Each element + // is a single uint for three RGB component. + return rows * shifted_columns * sizeof(uint32_t); +} + +bool pl_shader_error_diffusion(pl_shader sh, const struct pl_error_diffusion_params *params) +{ + const int width = params->input_tex->params.w, height = params->input_tex->params.h; + const struct pl_glsl_version glsl = sh_glsl(sh); + const struct pl_error_diffusion_kernel *kernel = + PL_DEF(params->kernel, &pl_error_diffusion_sierra_lite); + + pl_assert(params->output_tex->params.w == width); + pl_assert(params->output_tex->params.h == height); + if (!sh_require(sh, PL_SHADER_SIG_NONE, width, height)) + return false; + + if (params->new_depth <= 0 || params->new_depth > 256) { + PL_WARN(sh, "Invalid dither depth: %d.. ignoring", params->new_depth); + return false; + } + + // The parallel error diffusion works by applying the shift mapping first. 
+ // Take the Floyd-Steinberg algorithm as an example. After applying
+ // the (y, x) -> (y, x + y * shift) mapping (with shift=2), all errors are
+ // propagated into the next few columns, which makes parallel processing on
+ // the same column possible.
+ //
+ // X 7/16 X 7/16
+ // 3/16 5/16 1/16 ==> 0 0 3/16 5/16 1/16
+
+ // Figure out the size of the rectangle containing all shifted pixels.
+ // The rectangle height is not changed.
+ int shifted_width = width + (height - 1) * kernel->shift;
+
+ // We process all pixels of the shifted rectangle column by column, with
+ // a single global work group of size |block_size|.
+ // Figure out how many blocks are required to process all pixels. We need
+ // this explicitly to make the number of barrier() calls match.
+ int block_size = PL_MIN(glsl.max_group_threads, height);
+ int blocks = PL_DIV_UP(height * shifted_width, block_size);
+
+ // If we figure out how many of the next columns will be affected while the
+ // current column is being processed, we can store the errors of only a few
+ // columns in shared memory. Using a ring buffer further saves the
+ // cost while iterating to the next column.
+ //
+ int ring_buffer_rows = height + PL_EDF_MAX_DY;
+ int ring_buffer_columns = compute_rightmost_shifted_column(kernel) + 1;
+ ident_t ring_buffer_size = sh_const(sh, (struct pl_shader_const) {
+ .type = PL_VAR_UINT,
+ .name = "ring_buffer_size",
+ .data = &(unsigned) { ring_buffer_rows * ring_buffer_columns },
+ .compile_time = true,
+ });
+
+ // Compute the shared memory requirements and try enabling compute shaders.
+ size_t shmem_req = ring_buffer_rows * ring_buffer_columns * sizeof(uint32_t);
+ if (!sh_try_compute(sh, block_size, 1, false, shmem_req)) {
+ PL_ERR(sh, "Cannot execute error diffusion kernel: too old GPU or "
+ "insufficient compute shader memory!");
+ return false;
+ }
+
+ ident_t in_tex = sh_desc(sh, (struct pl_shader_desc) {
+ .binding.object = params->input_tex,
+ .desc = {
+ .name = "input_tex",
+ .type = PL_DESC_SAMPLED_TEX,
+ },
+ });
+
+ ident_t out_img = sh_desc(sh, (struct pl_shader_desc) {
+ .binding.object = params->output_tex,
+ .desc = {
+ .name = "output_tex",
+ .type = PL_DESC_STORAGE_IMG,
+ .access = PL_DESC_ACCESS_WRITEONLY,
+ },
+ });
+
+ sh->output = PL_SHADER_SIG_NONE;
+ sh_describef(sh, "error diffusion (%s, %d bits)",
+ kernel->name, params->new_depth);
+
+ // Define the ring buffer in shared memory.
+ GLSLH("shared uint err_rgb8["$"]; \n", ring_buffer_size);
+ GLSL("// pl_shader_error_diffusion \n"
+ // Safeguard against accidental over-execution
+ "if (gl_WorkGroupID != uvec3(0)) \n"
+ " return; \n"
+ // Initialize the ring buffer.
+ "for (uint i = gl_LocalInvocationIndex; i < "$"; i+=gl_WorkGroupSize.x)\n"
+ " err_rgb8[i] = 0u; \n"
+
+ // Main block loop; the barrier ensures the previous block has been
+ // fully processed before processing of the next one starts.
+ "for (uint block_id = 0; block_id < "$"; block_id++) { \n"
+ "barrier(); \n"
+ // Compute the coordinate of the pixel we are currently processing,
+ // both before and after the shift mapping.
+ "uint id = block_id * gl_WorkGroupSize.x + gl_LocalInvocationIndex; \n"
+ "const uint height = "$"; \n"
+ "int y = int(id %% height), x_shifted = int(id / height); \n"
+ "int x = x_shifted - y * %d; \n"
+ // Proceed only if we are processing a valid pixel.
+ "if (x >= 0 && x < "$") { \n"
+ // The index of the current pixel in the ring buffer.
+ "uint idx = uint(x_shifted * "$" + y) %% "$"; \n"
+ // Fetch the current pixel.
+ "vec4 pix_orig = texelFetch("$", ivec2(x, y), 0); \n" + "vec3 pix = pix_orig.rgb; \n", + ring_buffer_size, + SH_UINT(blocks), + SH_UINT(height), + kernel->shift, + SH_INT(width), + SH_INT(ring_buffer_rows), + ring_buffer_size, + in_tex); + + // The dithering will quantize pixel value into multiples of 1/dither_quant. + int dither_quant = (1 << params->new_depth) - 1; + + // We encode errors in RGB components into a single 32-bit unsigned integer. + // The error we propagate from the current pixel is in range of + // [-0.5 / dither_quant, 0.5 / dither_quant]. While not quite obvious, the + // sum of all errors been propagated into a pixel is also in the same range. + // It's possible to map errors in this range into [-127, 127], and use an + // unsigned 8-bit integer to store it (using standard two's complement). + // The three 8-bit unsigned integers can then be encoded into a single + // 32-bit unsigned integer, with two 4-bit padding to prevent addition + // operation overflows affecting other component. There are at most 12 + // addition operations on each pixel, so 4-bit padding should be enough. + // The overflow from R component will be discarded. + // + // The following figure is how the encoding looks like. + // + // +------------------------------------+ + // |RRRRRRRR|0000|GGGGGGGG|0000|BBBBBBBB| + // +------------------------------------+ + // + + // The bitshift position for R and G component. + const int bitshift_r = 24, bitshift_g = 12; + // The multiplier we use to map [-0.5, 0.5] to [-127, 127]. + const int uint8_mul = 127 * 2; + + GLSL(// Add the error previously propagated into current pixel, and clear + // it in the ring buffer. + "uint err_u32 = err_rgb8[idx] + %uu; \n" + "pix = pix * %d.0 + vec3(int((err_u32 >> %d) & 0xFFu) - 128, \n" + " int((err_u32 >> %d) & 0xFFu) - 128, \n" + " int( err_u32 & 0xFFu) - 128) / %d.0; \n" + "err_rgb8[idx] = 0u; \n" + // Write the dithered pixel. + "vec3 dithered = round(pix); \n" + "imageStore("$", ivec2(x, y), vec4(dithered / %d.0, pix_orig.a)); \n" + // Prepare for error propagation pass + "vec3 err_divided = (pix - dithered) * %d.0 / %d.0; \n" + "ivec3 tmp; \n", + (128u << bitshift_r) | (128u << bitshift_g) | 128u, + dither_quant, bitshift_r, bitshift_g, uint8_mul, + out_img, dither_quant, + uint8_mul, kernel->divisor); + + // Group error propagation with same weight factor together, in order to + // reduce the number of annoying error encoding. + for (int dividend = 1; dividend <= kernel->divisor; dividend++) { + bool err_assigned = false; + + for (int y = 0; y <= PL_EDF_MAX_DY; y++) { + for (int x = PL_EDF_MIN_DX; x <= PL_EDF_MAX_DX; x++) { + if (kernel->pattern[y][x - PL_EDF_MIN_DX] != dividend) + continue; + + if (!err_assigned) { + err_assigned = true; + + GLSL("tmp = ivec3(round(err_divided * %d.0)); \n" + "err_u32 = (uint(tmp.r & 0xFF) << %d) | \n" + " (uint(tmp.g & 0xFF) << %d) | \n" + " uint(tmp.b & 0xFF); \n", + dividend, + bitshift_r, bitshift_g); + } + + int shifted_x = x + y * kernel->shift; + + // Unlike the right border, errors propagated out from left + // border will remain in the ring buffer. This will produce + // visible artifacts near the left border, especially for + // shift=3 kernels. + if (x < 0) + GLSL("if (x >= %d) \n", -x); + + // Calculate the new position in the ring buffer to propagate + // the error into. 
+ int ring_buffer_delta = shifted_x * ring_buffer_rows + y; + GLSL("atomicAdd(err_rgb8[(idx + %du) %% "$"], err_u32); \n", + ring_buffer_delta, ring_buffer_size); + } + } + } + + GLSL("}} \n"); // end of main loop + valid pixel conditional + return true; +} diff --git a/src/shaders/film_grain.c b/src/shaders/film_grain.c new file mode 100644 index 0000000..b1d25ff --- /dev/null +++ b/src/shaders/film_grain.c @@ -0,0 +1,65 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "shaders.h" +#include "shaders/film_grain.h" + +bool pl_needs_film_grain(const struct pl_film_grain_params *params) +{ + switch (params->data.type) { + case PL_FILM_GRAIN_NONE: return false; + case PL_FILM_GRAIN_AV1: return pl_needs_fg_av1(params); + case PL_FILM_GRAIN_H274: return pl_needs_fg_h274(params); + default: pl_unreachable(); + } +} + +struct sh_grain_obj { + pl_shader_obj av1; + pl_shader_obj h274; +}; + +static void sh_grain_uninit(pl_gpu gpu, void *ptr) +{ + struct sh_grain_obj *obj = ptr; + pl_shader_obj_destroy(&obj->av1); + pl_shader_obj_destroy(&obj->h274); +} + +bool pl_shader_film_grain(pl_shader sh, pl_shader_obj *grain_state, + const struct pl_film_grain_params *params) +{ + if (!pl_needs_film_grain(params)) { + // FIXME: Instead of erroring, sample directly + SH_FAIL(sh, "pl_shader_film_grain called but no film grain needs to be " + "applied, test with `pl_needs_film_grain` first!"); + return false; + } + + struct sh_grain_obj *obj; + obj = SH_OBJ(sh, grain_state, PL_SHADER_OBJ_FILM_GRAIN, + struct sh_grain_obj, sh_grain_uninit); + if (!obj) + return false; + + switch (params->data.type) { + case PL_FILM_GRAIN_NONE: return false; + case PL_FILM_GRAIN_AV1: return pl_shader_fg_av1(sh, &obj->av1, params); + case PL_FILM_GRAIN_H274: return pl_shader_fg_h274(sh, &obj->h274, params); + default: pl_unreachable(); + } +} diff --git a/src/shaders/film_grain.h b/src/shaders/film_grain.h new file mode 100644 index 0000000..f6498c1 --- /dev/null +++ b/src/shaders/film_grain.h @@ -0,0 +1,75 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#pragma once + +#include "common.h" + +#include <libplacebo/shaders/film_grain.h> + +bool pl_needs_fg_av1(const struct pl_film_grain_params *); +bool pl_needs_fg_h274(const struct pl_film_grain_params *); + +bool pl_shader_fg_av1(pl_shader, pl_shader_obj *, const struct pl_film_grain_params *); +bool pl_shader_fg_h274(pl_shader, pl_shader_obj *, const struct pl_film_grain_params *); + +// Common helper function +static inline enum pl_channel channel_map(int i, const struct pl_film_grain_params *params) +{ + static const enum pl_channel map_rgb[3] = { + [PL_CHANNEL_G] = PL_CHANNEL_Y, + [PL_CHANNEL_B] = PL_CHANNEL_CB, + [PL_CHANNEL_R] = PL_CHANNEL_CR, + }; + + static const enum pl_channel map_xyz[3] = { + [1] = PL_CHANNEL_Y, // Y + [2] = PL_CHANNEL_CB, // Z + [0] = PL_CHANNEL_CR, // X + }; + + if (i >= params->components) + return PL_CHANNEL_NONE; + + int comp = params->component_mapping[i]; + if (comp < 0 || comp > 2) + return PL_CHANNEL_NONE; + + switch (params->repr->sys) { + case PL_COLOR_SYSTEM_UNKNOWN: + case PL_COLOR_SYSTEM_RGB: + return map_rgb[comp]; + case PL_COLOR_SYSTEM_XYZ: + return map_xyz[comp]; + + case PL_COLOR_SYSTEM_BT_601: + case PL_COLOR_SYSTEM_BT_709: + case PL_COLOR_SYSTEM_SMPTE_240M: + case PL_COLOR_SYSTEM_BT_2020_NC: + case PL_COLOR_SYSTEM_BT_2020_C: + case PL_COLOR_SYSTEM_BT_2100_PQ: + case PL_COLOR_SYSTEM_BT_2100_HLG: + case PL_COLOR_SYSTEM_DOLBYVISION: + case PL_COLOR_SYSTEM_YCGCO: + return comp; + + case PL_COLOR_SYSTEM_COUNT: + break; + } + + pl_unreachable(); +} diff --git a/src/shaders/film_grain_av1.c b/src/shaders/film_grain_av1.c new file mode 100644 index 0000000..3b11ea3 --- /dev/null +++ b/src/shaders/film_grain_av1.c @@ -0,0 +1,1001 @@ +/* + * This file is part of libplacebo, which is normally licensed under the terms + * of the LGPL v2.1+. However, this file (film_grain_av1.c) is also available + * under the terms of the more permissive MIT license: + * + * Copyright (c) 2018-2019 Niklas Haas + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "shaders.h" +#include "shaders/film_grain.h" + +// Taken from the spec. 
Range is [-2048, 2047], mean is 0 and stddev is 512 +static const int16_t gaussian_sequence[2048] = { + 56, 568, -180, 172, 124, -84, 172, -64, -900, 24, 820, + 224, 1248, 996, 272, -8, -916, -388, -732, -104, -188, 800, + 112, -652, -320, -376, 140, -252, 492, -168, 44, -788, 588, + -584, 500, -228, 12, 680, 272, -476, 972, -100, 652, 368, + 432, -196, -720, -192, 1000, -332, 652, -136, -552, -604, -4, + 192, -220, -136, 1000, -52, 372, -96, -624, 124, -24, 396, + 540, -12, -104, 640, 464, 244, -208, -84, 368, -528, -740, + 248, -968, -848, 608, 376, -60, -292, -40, -156, 252, -292, + 248, 224, -280, 400, -244, 244, -60, 76, -80, 212, 532, + 340, 128, -36, 824, -352, -60, -264, -96, -612, 416, -704, + 220, -204, 640, -160, 1220, -408, 900, 336, 20, -336, -96, + -792, 304, 48, -28, -1232, -1172, -448, 104, -292, -520, 244, + 60, -948, 0, -708, 268, 108, 356, -548, 488, -344, -136, + 488, -196, -224, 656, -236, -1128, 60, 4, 140, 276, -676, + -376, 168, -108, 464, 8, 564, 64, 240, 308, -300, -400, + -456, -136, 56, 120, -408, -116, 436, 504, -232, 328, 844, + -164, -84, 784, -168, 232, -224, 348, -376, 128, 568, 96, + -1244, -288, 276, 848, 832, -360, 656, 464, -384, -332, -356, + 728, -388, 160, -192, 468, 296, 224, 140, -776, -100, 280, + 4, 196, 44, -36, -648, 932, 16, 1428, 28, 528, 808, + 772, 20, 268, 88, -332, -284, 124, -384, -448, 208, -228, + -1044, -328, 660, 380, -148, -300, 588, 240, 540, 28, 136, + -88, -436, 256, 296, -1000, 1400, 0, -48, 1056, -136, 264, + -528, -1108, 632, -484, -592, -344, 796, 124, -668, -768, 388, + 1296, -232, -188, -200, -288, -4, 308, 100, -168, 256, -500, + 204, -508, 648, -136, 372, -272, -120, -1004, -552, -548, -384, + 548, -296, 428, -108, -8, -912, -324, -224, -88, -112, -220, + -100, 996, -796, 548, 360, -216, 180, 428, -200, -212, 148, + 96, 148, 284, 216, -412, -320, 120, -300, -384, -604, -572, + -332, -8, -180, -176, 696, 116, -88, 628, 76, 44, -516, + 240, -208, -40, 100, -592, 344, -308, -452, -228, 20, 916, + -1752, -136, -340, -804, 140, 40, 512, 340, 248, 184, -492, + 896, -156, 932, -628, 328, -688, -448, -616, -752, -100, 560, + -1020, 180, -800, -64, 76, 576, 1068, 396, 660, 552, -108, + -28, 320, -628, 312, -92, -92, -472, 268, 16, 560, 516, + -672, -52, 492, -100, 260, 384, 284, 292, 304, -148, 88, + -152, 1012, 1064, -228, 164, -376, -684, 592, -392, 156, 196, + -524, -64, -884, 160, -176, 636, 648, 404, -396, -436, 864, + 424, -728, 988, -604, 904, -592, 296, -224, 536, -176, -920, + 436, -48, 1176, -884, 416, -776, -824, -884, 524, -548, -564, + -68, -164, -96, 692, 364, -692, -1012, -68, 260, -480, 876, + -1116, 452, -332, -352, 892, -1088, 1220, -676, 12, -292, 244, + 496, 372, -32, 280, 200, 112, -440, -96, 24, -644, -184, + 56, -432, 224, -980, 272, -260, 144, -436, 420, 356, 364, + -528, 76, 172, -744, -368, 404, -752, -416, 684, -688, 72, + 540, 416, 92, 444, 480, -72, -1416, 164, -1172, -68, 24, + 424, 264, 1040, 128, -912, -524, -356, 64, 876, -12, 4, + -88, 532, 272, -524, 320, 276, -508, 940, 24, -400, -120, + 756, 60, 236, -412, 100, 376, -484, 400, -100, -740, -108, + -260, 328, -268, 224, -200, -416, 184, -604, -564, -20, 296, + 60, 892, -888, 60, 164, 68, -760, 216, -296, 904, -336, + -28, 404, -356, -568, -208, -1480, -512, 296, 328, -360, -164, + -1560, -776, 1156, -428, 164, -504, -112, 120, -216, -148, -264, + 308, 32, 64, -72, 72, 116, 176, -64, -272, 460, -536, + -784, -280, 348, 108, -752, -132, 524, -540, -776, 116, -296, + -1196, -288, -560, 1040, -472, 116, -848, -1116, 116, 636, 696, + 284, -176, 1016, 
204, -864, -648, -248, 356, 972, -584, -204, + 264, 880, 528, -24, -184, 116, 448, -144, 828, 524, 212, + -212, 52, 12, 200, 268, -488, -404, -880, 824, -672, -40, + 908, -248, 500, 716, -576, 492, -576, 16, 720, -108, 384, + 124, 344, 280, 576, -500, 252, 104, -308, 196, -188, -8, + 1268, 296, 1032, -1196, 436, 316, 372, -432, -200, -660, 704, + -224, 596, -132, 268, 32, -452, 884, 104, -1008, 424, -1348, + -280, 4, -1168, 368, 476, 696, 300, -8, 24, 180, -592, + -196, 388, 304, 500, 724, -160, 244, -84, 272, -256, -420, + 320, 208, -144, -156, 156, 364, 452, 28, 540, 316, 220, + -644, -248, 464, 72, 360, 32, -388, 496, -680, -48, 208, + -116, -408, 60, -604, -392, 548, -840, 784, -460, 656, -544, + -388, -264, 908, -800, -628, -612, -568, 572, -220, 164, 288, + -16, -308, 308, -112, -636, -760, 280, -668, 432, 364, 240, + -196, 604, 340, 384, 196, 592, -44, -500, 432, -580, -132, + 636, -76, 392, 4, -412, 540, 508, 328, -356, -36, 16, + -220, -64, -248, -60, 24, -192, 368, 1040, 92, -24, -1044, + -32, 40, 104, 148, 192, -136, -520, 56, -816, -224, 732, + 392, 356, 212, -80, -424, -1008, -324, 588, -1496, 576, 460, + -816, -848, 56, -580, -92, -1372, -112, -496, 200, 364, 52, + -140, 48, -48, -60, 84, 72, 40, 132, -356, -268, -104, + -284, -404, 732, -520, 164, -304, -540, 120, 328, -76, -460, + 756, 388, 588, 236, -436, -72, -176, -404, -316, -148, 716, + -604, 404, -72, -88, -888, -68, 944, 88, -220, -344, 960, + 472, 460, -232, 704, 120, 832, -228, 692, -508, 132, -476, + 844, -748, -364, -44, 1116, -1104, -1056, 76, 428, 552, -692, + 60, 356, 96, -384, -188, -612, -576, 736, 508, 892, 352, + -1132, 504, -24, -352, 324, 332, -600, -312, 292, 508, -144, + -8, 484, 48, 284, -260, -240, 256, -100, -292, -204, -44, + 472, -204, 908, -188, -1000, -256, 92, 1164, -392, 564, 356, + 652, -28, -884, 256, 484, -192, 760, -176, 376, -524, -452, + -436, 860, -736, 212, 124, 504, -476, 468, 76, -472, 552, + -692, -944, -620, 740, -240, 400, 132, 20, 192, -196, 264, + -668, -1012, -60, 296, -316, -828, 76, -156, 284, -768, -448, + -832, 148, 248, 652, 616, 1236, 288, -328, -400, -124, 588, + 220, 520, -696, 1032, 768, -740, -92, -272, 296, 448, -464, + 412, -200, 392, 440, -200, 264, -152, -260, 320, 1032, 216, + 320, -8, -64, 156, -1016, 1084, 1172, 536, 484, -432, 132, + 372, -52, -256, 84, 116, -352, 48, 116, 304, -384, 412, + 924, -300, 528, 628, 180, 648, 44, -980, -220, 1320, 48, + 332, 748, 524, -268, -720, 540, -276, 564, -344, -208, -196, + 436, 896, 88, -392, 132, 80, -964, -288, 568, 56, -48, + -456, 888, 8, 552, -156, -292, 948, 288, 128, -716, -292, + 1192, -152, 876, 352, -600, -260, -812, -468, -28, -120, -32, + -44, 1284, 496, 192, 464, 312, -76, -516, -380, -456, -1012, + -48, 308, -156, 36, 492, -156, -808, 188, 1652, 68, -120, + -116, 316, 160, -140, 352, 808, -416, 592, 316, -480, 56, + 528, -204, -568, 372, -232, 752, -344, 744, -4, 324, -416, + -600, 768, 268, -248, -88, -132, -420, -432, 80, -288, 404, + -316, -1216, -588, 520, -108, 92, -320, 368, -480, -216, -92, + 1688, -300, 180, 1020, -176, 820, -68, -228, -260, 436, -904, + 20, 40, -508, 440, -736, 312, 332, 204, 760, -372, 728, + 96, -20, -632, -520, -560, 336, 1076, -64, -532, 776, 584, + 192, 396, -728, -520, 276, -188, 80, -52, -612, -252, -48, + 648, 212, -688, 228, -52, -260, 428, -412, -272, -404, 180, + 816, -796, 48, 152, 484, -88, -216, 988, 696, 188, -528, + 648, -116, -180, 316, 476, 12, -564, 96, 476, -252, -364, + -376, -392, 556, -256, -576, 260, -352, 120, -16, -136, -260, + -492, 72, 556, 660, 580, 616, 
772, 436, 424, -32, -324, + -1268, 416, -324, -80, 920, 160, 228, 724, 32, -516, 64, + 384, 68, -128, 136, 240, 248, -204, -68, 252, -932, -120, + -480, -628, -84, 192, 852, -404, -288, -132, 204, 100, 168, + -68, -196, -868, 460, 1080, 380, -80, 244, 0, 484, -888, + 64, 184, 352, 600, 460, 164, 604, -196, 320, -64, 588, + -184, 228, 12, 372, 48, -848, -344, 224, 208, -200, 484, + 128, -20, 272, -468, -840, 384, 256, -720, -520, -464, -580, + 112, -120, 644, -356, -208, -608, -528, 704, 560, -424, 392, + 828, 40, 84, 200, -152, 0, -144, 584, 280, -120, 80, + -556, -972, -196, -472, 724, 80, 168, -32, 88, 160, -688, + 0, 160, 356, 372, -776, 740, -128, 676, -248, -480, 4, + -364, 96, 544, 232, -1032, 956, 236, 356, 20, -40, 300, + 24, -676, -596, 132, 1120, -104, 532, -1096, 568, 648, 444, + 508, 380, 188, -376, -604, 1488, 424, 24, 756, -220, -192, + 716, 120, 920, 688, 168, 44, -460, 568, 284, 1144, 1160, + 600, 424, 888, 656, -356, -320, 220, 316, -176, -724, -188, + -816, -628, -348, -228, -380, 1012, -452, -660, 736, 928, 404, + -696, -72, -268, -892, 128, 184, -344, -780, 360, 336, 400, + 344, 428, 548, -112, 136, -228, -216, -820, -516, 340, 92, + -136, 116, -300, 376, -244, 100, -316, -520, -284, -12, 824, + 164, -548, -180, -128, 116, -924, -828, 268, -368, -580, 620, + 192, 160, 0, -1676, 1068, 424, -56, -360, 468, -156, 720, + 288, -528, 556, -364, 548, -148, 504, 316, 152, -648, -620, + -684, -24, -376, -384, -108, -920, -1032, 768, 180, -264, -508, + -1268, -260, -60, 300, -240, 988, 724, -376, -576, -212, -736, + 556, 192, 1092, -620, -880, 376, -56, -4, -216, -32, 836, + 268, 396, 1332, 864, -600, 100, 56, -412, -92, 356, 180, + 884, -468, -436, 292, -388, -804, -704, -840, 368, -348, 140, + -724, 1536, 940, 372, 112, -372, 436, -480, 1136, 296, -32, + -228, 132, -48, -220, 868, -1016, -60, -1044, -464, 328, 916, + 244, 12, -736, -296, 360, 468, -376, -108, -92, 788, 368, + -56, 544, 400, -672, -420, 728, 16, 320, 44, -284, -380, + -796, 488, 132, 204, -596, -372, 88, -152, -908, -636, -572, + -624, -116, -692, -200, -56, 276, -88, 484, -324, 948, 864, + 1000, -456, -184, -276, 292, -296, 156, 676, 320, 160, 908, + -84, -1236, -288, -116, 260, -372, -644, 732, -756, -96, 84, + 344, -520, 348, -688, 240, -84, 216, -1044, -136, -676, -396, + -1500, 960, -40, 176, 168, 1516, 420, -504, -344, -364, -360, + 1216, -940, -380, -212, 252, -660, -708, 484, -444, -152, 928, + -120, 1112, 476, -260, 560, -148, -344, 108, -196, 228, -288, + 504, 560, -328, -88, 288, -1008, 460, -228, 468, -836, -196, + 76, 388, 232, 412, -1168, -716, -644, 756, -172, -356, -504, + 116, 432, 528, 48, 476, -168, -608, 448, 160, -532, -272, + 28, -676, -12, 828, 980, 456, 520, 104, -104, 256, -344, + -4, -28, -368, -52, -524, -572, -556, -200, 768, 1124, -208, + -512, 176, 232, 248, -148, -888, 604, -600, -304, 804, -156, + -212, 488, -192, -804, -256, 368, -360, -916, -328, 228, -240, + -448, -472, 856, -556, -364, 572, -12, -156, -368, -340, 432, + 252, -752, -152, 288, 268, -580, -848, -592, 108, -76, 244, + 312, -716, 592, -80, 436, 360, 4, -248, 160, 516, 584, + 732, 44, -468, -280, -292, -156, -588, 28, 308, 912, 24, + 124, 156, 180, -252, 944, -924, -772, -520, -428, -624, 300, + -212, -1144, 32, -724, 800, -1128, -212, -1288, -848, 180, -416, + 440, 192, -576, -792, -76, -1080, 80, -532, -352, -132, 380, + -820, 148, 1112, 128, 164, 456, 700, -924, 144, -668, -384, + 648, -832, 508, 552, -52, -100, -656, 208, -568, 748, -88, + 680, 232, 300, 192, -408, -1012, -152, -252, -268, 272, -876, + -664, 
-648, -332, -136, 16, 12, 1152, -28, 332, -536, 320, + -672, -460, -316, 532, -260, 228, -40, 1052, -816, 180, 88, + -496, -556, -672, -368, 428, 92, 356, 404, -408, 252, 196, + -176, -556, 792, 268, 32, 372, 40, 96, -332, 328, 120, + 372, -900, -40, 472, -264, -592, 952, 128, 656, 112, 664, + -232, 420, 4, -344, -464, 556, 244, -416, -32, 252, 0, + -412, 188, -696, 508, -476, 324, -1096, 656, -312, 560, 264, + -136, 304, 160, -64, -580, 248, 336, -720, 560, -348, -288, + -276, -196, -500, 852, -544, -236, -1128, -992, -776, 116, 56, + 52, 860, 884, 212, -12, 168, 1020, 512, -552, 924, -148, + 716, 188, 164, -340, -520, -184, 880, -152, -680, -208, -1156, + -300, -528, -472, 364, 100, -744, -1056, -32, 540, 280, 144, + -676, -32, -232, -280, -224, 96, 568, -76, 172, 148, 148, + 104, 32, -296, -32, 788, -80, 32, -16, 280, 288, 944, + 428, -484 +}; + +static inline int get_random_number(int bits, uint16_t *state) +{ + int r = *state; + uint16_t bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1; + *state = (r >> 1) | (bit << 15); + + return (*state >> (16 - bits)) & ((1 << bits) - 1); +} + +static inline int round2(int x, int shift) +{ + if (!shift) + return x; + + return (x + (1 << (shift - 1))) >> shift; +} + +enum { + BLOCK_SIZE = 32, + SCALING_LUT_SIZE = 256, + + GRAIN_WIDTH = 82, + GRAIN_HEIGHT = 73, + // On the GPU we only need a subsection of this + GRAIN_WIDTH_LUT = 64, + GRAIN_HEIGHT_LUT = 64, + GRAIN_PAD_LUT = 9, + + // For subsampled grain textures + SUB_GRAIN_WIDTH = 44, + SUB_GRAIN_HEIGHT = 38, + SUB_GRAIN_WIDTH_LUT = GRAIN_WIDTH_LUT >> 1, + SUB_GRAIN_HEIGHT_LUT = GRAIN_HEIGHT_LUT >> 1, + SUB_GRAIN_PAD_LUT = 6, +}; + +// Contains the shift by which the offsets are indexed +enum offset { + OFFSET_TL = 24, + OFFSET_T = 16, + OFFSET_L = 8, + OFFSET_N = 0, +}; + +// Helper function to compute some common constants +struct grain_scale { + int grain_center; + int grain_min; + int grain_max; + float texture_scale; + float grain_scale; +}; + +static inline int bit_depth(const struct pl_color_repr *repr) +{ + int depth = PL_DEF(repr->bits.color_depth, + PL_DEF(repr->bits.sample_depth, 8)); + pl_assert(depth >= 8); + return PL_MIN(depth, 12); +} + +static struct grain_scale get_grain_scale(const struct pl_film_grain_params *params) +{ + int bits = bit_depth(params->repr); + struct grain_scale ret = { + .grain_center = 128 << (bits - 8), + }; + + ret.grain_min = -ret.grain_center; + ret.grain_max = (256 << (bits - 8)) - 1 - ret.grain_center; + + struct pl_color_repr repr = *params->repr; + ret.texture_scale = pl_color_repr_normalize(&repr); + + // Since our color samples are normalized to the range [0, 1], we need to + // scale down grain values from the scale [0, 2^b - 1] to this range. + ret.grain_scale = 1.0 / ((1 << bits) - 1); + + return ret; +} + +// Generates the basic grain table (LumaGrain in the spec). 
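+// An 82x73 grain image is seeded from the Gaussian sequence via an 11-bit
+// PRNG, filtered with the auto-regressive coefficients, and a 64x64 window
+// (offset by GRAIN_PAD_LUT) is copied out, scaled by grain_scale, for the GPU.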
+static void generate_grain_y(float out[GRAIN_HEIGHT_LUT][GRAIN_WIDTH_LUT], + int16_t buf[GRAIN_HEIGHT][GRAIN_WIDTH], + const struct pl_film_grain_params *params) +{ + const struct pl_av1_grain_data *data = ¶ms->data.params.av1; + struct grain_scale scale = get_grain_scale(params); + uint16_t seed = (uint16_t) params->data.seed; + int bits = bit_depth(params->repr); + int shift = 12 - bits + data->grain_scale_shift; + pl_assert(shift >= 0); + + for (int y = 0; y < GRAIN_HEIGHT; y++) { + for (int x = 0; x < GRAIN_WIDTH; x++) { + int16_t value = gaussian_sequence[ get_random_number(11, &seed) ]; + buf[y][x] = round2(value, shift); + } + } + + const int ar_pad = 3; + int ar_lag = data->ar_coeff_lag; + + for (int y = ar_pad; y < GRAIN_HEIGHT; y++) { + for (int x = ar_pad; x < GRAIN_WIDTH - ar_pad; x++) { + const int8_t *coeff = data->ar_coeffs_y; + int sum = 0; + for (int dy = -ar_lag; dy <= 0; dy++) { + for (int dx = -ar_lag; dx <= ar_lag; dx++) { + if (!dx && !dy) + break; + sum += *(coeff++) * buf[y + dy][x + dx]; + } + } + + int16_t grain = buf[y][x] + round2(sum, data->ar_coeff_shift); + grain = PL_CLAMP(grain, scale.grain_min, scale.grain_max); + buf[y][x] = grain; + } + } + + for (int y = 0; y < GRAIN_HEIGHT_LUT; y++) { + for (int x = 0; x < GRAIN_WIDTH_LUT; x++) { + int16_t grain = buf[y + GRAIN_PAD_LUT][x + GRAIN_PAD_LUT]; + out[y][x] = grain * scale.grain_scale; + } + } +} + +static void generate_grain_uv(float *out, int16_t buf[GRAIN_HEIGHT][GRAIN_WIDTH], + const int16_t buf_y[GRAIN_HEIGHT][GRAIN_WIDTH], + enum pl_channel channel, int sub_x, int sub_y, + const struct pl_film_grain_params *params) +{ + const struct pl_av1_grain_data *data = ¶ms->data.params.av1; + struct grain_scale scale = get_grain_scale(params); + int bits = bit_depth(params->repr); + int shift = 12 - bits + data->grain_scale_shift; + pl_assert(shift >= 0); + + uint16_t seed = params->data.seed; + if (channel == PL_CHANNEL_CB) { + seed ^= 0xb524; + } else if (channel == PL_CHANNEL_CR) { + seed ^= 0x49d8; + } + + int chromaW = sub_x ? SUB_GRAIN_WIDTH : GRAIN_WIDTH; + int chromaH = sub_y ? SUB_GRAIN_HEIGHT : GRAIN_HEIGHT; + + const int8_t *coeffs[] = { + [PL_CHANNEL_CB] = data->ar_coeffs_uv[0], + [PL_CHANNEL_CR] = data->ar_coeffs_uv[1], + }; + + for (int y = 0; y < chromaH; y++) { + for (int x = 0; x < chromaW; x++) { + int16_t value = gaussian_sequence[ get_random_number(11, &seed) ]; + buf[y][x] = round2(value, shift); + } + } + + const int ar_pad = 3; + int ar_lag = data->ar_coeff_lag; + + for (int y = ar_pad; y < chromaH; y++) { + for (int x = ar_pad; x < chromaW - ar_pad; x++) { + const int8_t *coeff = coeffs[channel]; + pl_assert(coeff); + int sum = 0; + for (int dy = -ar_lag; dy <= 0; dy++) { + for (int dx = -ar_lag; dx <= ar_lag; dx++) { + // For the final (current) pixel, we need to add in the + // contribution from the luma grain texture + if (!dx && !dy) { + if (!data->num_points_y) + break; + int luma = 0; + int lumaX = ((x - ar_pad) << sub_x) + ar_pad; + int lumaY = ((y - ar_pad) << sub_y) + ar_pad; + for (int i = 0; i <= sub_y; i++) { + for (int j = 0; j <= sub_x; j++) { + luma += buf_y[lumaY + i][lumaX + j]; + } + } + luma = round2(luma, sub_x + sub_y); + sum += luma * (*coeff); + break; + } + + sum += *(coeff++) * buf[y + dy][x + dx]; + } + } + + int16_t grain = buf[y][x] + round2(sum, data->ar_coeff_shift); + grain = PL_CLAMP(grain, scale.grain_min, scale.grain_max); + buf[y][x] = grain; + } + } + + int lutW = GRAIN_WIDTH_LUT >> sub_x; + int lutH = GRAIN_HEIGHT_LUT >> sub_y; + int padX = sub_x ? 
SUB_GRAIN_PAD_LUT : GRAIN_PAD_LUT; + int padY = sub_y ? SUB_GRAIN_PAD_LUT : GRAIN_PAD_LUT; + + for (int y = 0; y < lutH; y++) { + for (int x = 0; x < lutW; x++) { + int16_t grain = buf[y + padY][x + padX]; + out[y * lutW + x] = grain * scale.grain_scale; + } + } +} + +static void generate_offsets(void *pbuf, const struct sh_lut_params *params) +{ + const struct pl_film_grain_data *data = params->priv; + unsigned int *buf = pbuf; + pl_static_assert(sizeof(unsigned int) >= sizeof(uint32_t)); + + for (int y = 0; y < params->height; y++) { + uint16_t state = data->seed; + state ^= ((y * 37 + 178) & 0xFF) << 8; + state ^= ((y * 173 + 105) & 0xFF); + + for (int x = 0; x < params->width; x++) { + unsigned int *offsets = &buf[y * params->width + x]; + + uint8_t val = get_random_number(8, &state); + uint8_t val_l = x ? (offsets - 1)[0] : 0; + uint8_t val_t = y ? (offsets - params->width)[0] : 0; + uint8_t val_tl = x && y ? (offsets - params->width - 1)[0] : 0; + + // Encode four offsets into a single 32-bit integer for the + // convenience of the GPU. That way only one LUT fetch is + // required for the entire block. + *offsets = ((uint32_t) val_tl << OFFSET_TL) + | ((uint32_t) val_t << OFFSET_T) + | ((uint32_t) val_l << OFFSET_L) + | ((uint32_t) val << OFFSET_N); + } + } +} + +static void generate_scaling(void *pdata, const struct sh_lut_params *params) +{ + assert(params->width == SCALING_LUT_SIZE && params->comps == 1); + float *data = pdata; + + struct { + int num; + uint8_t (*points)[2]; + const struct pl_av1_grain_data *data; + } *ctx = params->priv; + + float range = 1 << ctx->data->scaling_shift; + + // Fill up the preceding entries with the initial value + for (int i = 0; i < ctx->points[0][0]; i++) + data[i] = ctx->points[0][1] / range; + + // Linearly interpolate the values in the middle + for (int i = 0; i < ctx->num - 1; i++) { + int bx = ctx->points[i][0]; + int by = ctx->points[i][1]; + int dx = ctx->points[i + 1][0] - bx; + int dy = ctx->points[i + 1][1] - by; + int delta = dy * ((0x10000 + (dx >> 1)) / dx); + for (int x = 0; x < dx; x++) { + int v = by + ((x * delta + 0x8000) >> 16); + data[bx + x] = v / range; + } + } + + // Fill up the remaining entries with the final value + for (int i = ctx->points[ctx->num - 1][0]; i < SCALING_LUT_SIZE; i++) + data[i] = ctx->points[ctx->num - 1][1] / range; +} + +static void sample(pl_shader sh, enum offset off, ident_t lut, int idx, + int sub_x, int sub_y) +{ + int dx = (off & OFFSET_L) ? 1 : 0, + dy = (off & OFFSET_T) ? 1 : 0; + + static const char *index_strs[] = { + [0] = ".x", + [1] = ".y", + }; + + GLSL("offset = uvec2(%du, %du) * uvec2((data >> %d) & 0xFu, \n" + " (data >> %d) & 0xFu);\n" + "pos = offset + local_id.xy + uvec2(%d, %d); \n" + "val = "$"(pos)%s; \n", + sub_x ? 1 : 2, sub_y ? 1 : 2, off + 4, off, + (BLOCK_SIZE >> sub_x) * dx, + (BLOCK_SIZE >> sub_y) * dy, + lut, idx >= 0 ? 
index_strs[idx] : ""); +} + +struct grain_obj_av1 { + // LUT objects for the offsets, grain and scaling luts + pl_shader_obj lut_offsets; + pl_shader_obj lut_grain[2]; + pl_shader_obj lut_scaling[3]; + + // Previous parameters used to check reusability + struct pl_film_grain_data data; + struct pl_color_repr repr; + bool fg_has_y; + bool fg_has_u; + bool fg_has_v; + + // Space to store the temporary arrays, reused + uint32_t *offsets; + float grain[2][GRAIN_HEIGHT_LUT][GRAIN_WIDTH_LUT]; + int16_t grain_tmp_y[GRAIN_HEIGHT][GRAIN_WIDTH]; + int16_t grain_tmp_uv[GRAIN_HEIGHT][GRAIN_WIDTH]; +}; + +static void av1_grain_uninit(pl_gpu gpu, void *ptr) +{ + struct grain_obj_av1 *obj = ptr; + pl_shader_obj_destroy(&obj->lut_offsets); + for (int i = 0; i < PL_ARRAY_SIZE(obj->lut_grain); i++) + pl_shader_obj_destroy(&obj->lut_grain[i]); + for (int i = 0; i < PL_ARRAY_SIZE(obj->lut_scaling); i++) + pl_shader_obj_destroy(&obj->lut_scaling[i]); + *obj = (struct grain_obj_av1) {0}; +} + +bool pl_needs_fg_av1(const struct pl_film_grain_params *params) +{ + const struct pl_av1_grain_data *data = ¶ms->data.params.av1; + bool has_y = data->num_points_y > 0; + bool has_u = data->num_points_uv[0] > 0 || data->chroma_scaling_from_luma; + bool has_v = data->num_points_uv[1] > 0 || data->chroma_scaling_from_luma; + + for (int i = 0; i < 3; i++) { + enum pl_channel channel = channel_map(i, params); + if (channel == PL_CHANNEL_Y && has_y) + return true; + if (channel == PL_CHANNEL_CB && has_u) + return true; + if (channel == PL_CHANNEL_CR && has_v) + return true; + } + + return false; +} + +static inline bool av1_grain_data_eq(const struct pl_film_grain_data *da, + const struct pl_film_grain_data *db) +{ + const struct pl_av1_grain_data *a = &da->params.av1, *b = &db->params.av1; + + // Only check the fields that are relevant for grain LUT generation + return da->seed == db->seed && + a->chroma_scaling_from_luma == b->chroma_scaling_from_luma && + a->scaling_shift == b->scaling_shift && + a->ar_coeff_lag == b->ar_coeff_lag && + a->ar_coeff_shift == b->ar_coeff_shift && + a->grain_scale_shift == b->grain_scale_shift && + !memcmp(a->ar_coeffs_y, b->ar_coeffs_y, sizeof(a->ar_coeffs_y)) && + !memcmp(a->ar_coeffs_uv, b->ar_coeffs_uv, sizeof(a->ar_coeffs_uv)); +} + +static void fill_grain_lut(void *data, const struct sh_lut_params *params) +{ + struct grain_obj_av1 *obj = params->priv; + size_t entries = params->width * params->height * params->comps; + memcpy(data, obj->grain, entries * sizeof(float)); +} + +bool pl_shader_fg_av1(pl_shader sh, pl_shader_obj *grain_state, + const struct pl_film_grain_params *params) +{ + int sub_x = 0, sub_y = 0; + int tex_w = params->tex->params.w, + tex_h = params->tex->params.h; + + if (params->luma_tex) { + sub_x = params->luma_tex->params.w > tex_w; + sub_y = params->luma_tex->params.h > tex_h; + } + + const struct pl_av1_grain_data *data = ¶ms->data.params.av1; + bool fg_has_y = data->num_points_y > 0; + bool fg_has_u = data->num_points_uv[0] > 0 || data->chroma_scaling_from_luma; + bool fg_has_v = data->num_points_uv[1] > 0 || data->chroma_scaling_from_luma; + + bool tex_is_y = false, tex_is_cb = false, tex_is_cr = false; + for (int i = 0; i < 3; i++) { + switch (channel_map(i, params)) { + case PL_CHANNEL_Y: tex_is_y = true; break; + case PL_CHANNEL_CB: tex_is_cb = true; break; + case PL_CHANNEL_CR: tex_is_cr = true; break; + default: break; + }; + } + + if (tex_is_y && (sub_x || sub_y)) { + PL_WARN(sh, "pl_film_grain_params.channels includes PL_CHANNEL_Y but " + "plane is 
subsampled, this makes no sense. Continuing anyway " + "but output is likely incorrect."); + } + + if (!sh_require(sh, PL_SHADER_SIG_NONE, tex_w, tex_h)) + return false; + + pl_gpu gpu = SH_GPU(sh); + if (!gpu) { + PL_ERR(sh, "AV1 film grain synthesis requires a non-NULL pl_gpu!"); + return false; + } + + // Disable generation for unneeded component types + fg_has_y &= tex_is_y; + fg_has_u &= tex_is_cb; + fg_has_v &= tex_is_cr; + + int bw = BLOCK_SIZE >> sub_x; + int bh = BLOCK_SIZE >> sub_y; + bool is_compute = sh_try_compute(sh, bw, bh, false, sizeof(uint32_t)); + + struct grain_obj_av1 *obj; + obj = SH_OBJ(sh, grain_state, PL_SHADER_OBJ_AV1_GRAIN, + struct grain_obj_av1, av1_grain_uninit); + if (!obj) + return false; + + // Note: In theory we could check only the parameters related to luma or + // only related to chroma and skip updating for changes to irrelevant + // parts, but this is probably not worth it since the seed is expected to + // change per frame anyway. + bool needs_update = !av1_grain_data_eq(¶ms->data, &obj->data) || + !pl_color_repr_equal(params->repr, &obj->repr) || + fg_has_y != obj->fg_has_y || + fg_has_u != obj->fg_has_u || + fg_has_v != obj->fg_has_v; + + if (needs_update) { + // This is needed even for chroma, so statically generate it + generate_grain_y(obj->grain[0], obj->grain_tmp_y, params); + } + + ident_t lut[3]; + int idx[3] = {-1}; + + if (fg_has_y) { + lut[0] = sh_lut(sh, sh_lut_params( + .object = &obj->lut_grain[0], + .var_type = PL_VAR_FLOAT, + .lut_type = SH_LUT_TEXTURE, + .width = GRAIN_WIDTH_LUT, + .height = GRAIN_HEIGHT_LUT, + .comps = 1, + .update = needs_update, + .dynamic = true, + .fill = fill_grain_lut, + .priv = obj, + )); + + if (!lut[0]) { + SH_FAIL(sh, "Failed generating/uploading luma grain LUT!"); + return false; + } + } + + // Try merging the chroma LUTs into a single texture + int chroma_comps = 0; + if (fg_has_u) { + generate_grain_uv(&obj->grain[chroma_comps][0][0], obj->grain_tmp_uv, + obj->grain_tmp_y, PL_CHANNEL_CB, sub_x, sub_y, + params); + idx[1] = chroma_comps++; + } + if (fg_has_v) { + generate_grain_uv(&obj->grain[chroma_comps][0][0], obj->grain_tmp_uv, + obj->grain_tmp_y, PL_CHANNEL_CR, sub_x, sub_y, + params); + idx[2] = chroma_comps++; + } + + if (chroma_comps > 0) { + lut[1] = lut[2] = sh_lut(sh, sh_lut_params( + .object = &obj->lut_grain[1], + .var_type = PL_VAR_FLOAT, + .lut_type = SH_LUT_TEXTURE, + .width = GRAIN_WIDTH_LUT >> sub_x, + .height = GRAIN_HEIGHT_LUT >> sub_y, + .comps = chroma_comps, + .update = needs_update, + .dynamic = true, + .fill = fill_grain_lut, + .priv = obj, + )); + + if (!lut[1]) { + SH_FAIL(sh, "Failed generating/uploading chroma grain LUT!"); + return false; + } + + if (chroma_comps == 1) + idx[1] = idx[2] = -1; + } + + ident_t offsets = sh_lut(sh, sh_lut_params( + .object = &obj->lut_offsets, + .var_type = PL_VAR_UINT, + .lut_type = SH_LUT_AUTO, + .width = PL_ALIGN2(tex_w << sub_x, 128) / 32, + .height = PL_ALIGN2(tex_h << sub_y, 128) / 32, + .comps = 1, + .update = needs_update, + .dynamic = true, + .fill = generate_offsets, + .priv = (void *) ¶ms->data, + )); + + if (!offsets) { + SH_FAIL(sh, "Failed generating/uploading block offsets LUT!"); + return false; + } + + // For the scaling LUTs, we assume they'll be relatively constant + // throughout the video so doing some extra work to avoid reinitializing + // them constantly is probably worth it. Probably. 
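+ // (The scaling LUTs depend only on the piecewise-linear scaling points,
+ // so we detect changes by comparing them against the copy cached in
+ // `obj->data` from the previous invocation.)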
+ const struct pl_av1_grain_data *obj_data = &obj->data.params.av1; + bool scaling_changed = false; + if (fg_has_y || data->chroma_scaling_from_luma) { + scaling_changed |= data->num_points_y != obj_data->num_points_y; + scaling_changed |= memcmp(data->points_y, obj_data->points_y, + sizeof(data->points_y)); + } + + if (fg_has_u && !data->chroma_scaling_from_luma) { + scaling_changed |= data->num_points_uv[0] != obj_data->num_points_uv[0]; + scaling_changed |= memcmp(data->points_uv[0], + obj_data->points_uv[0], + sizeof(data->points_uv[0])); + } + + if (fg_has_v && !data->chroma_scaling_from_luma) { + scaling_changed |= data->num_points_uv[1] != obj_data->num_points_uv[1]; + scaling_changed |= memcmp(data->points_uv[1], + obj_data->points_uv[1], + sizeof(data->points_uv[1])); + } + + ident_t scaling[3] = {0}; + for (int i = 0; i < 3; i++) { + struct { + int num; + const uint8_t (*points)[2]; + const struct pl_av1_grain_data *data; + } priv; + + priv.data = data; + if (i == 0 || data->chroma_scaling_from_luma) { + priv.num = data->num_points_y; + priv.points = &data->points_y[0]; + } else { + priv.num = data->num_points_uv[i - 1]; + priv.points = &data->points_uv[i - 1][0]; + } + + // Skip scaling for unneeded channels + bool has_c[3] = { fg_has_y, fg_has_u, fg_has_v }; + if (has_c[i] && priv.num > 0) { + scaling[i] = sh_lut(sh, sh_lut_params( + .object = &obj->lut_scaling[i], + .var_type = PL_VAR_FLOAT, + .method = SH_LUT_LINEAR, + .width = SCALING_LUT_SIZE, + .comps = 1, + .update = scaling_changed, + .dynamic = true, + .fill = generate_scaling, + .priv = &priv, + )); + + if (!scaling[i]) { + SH_FAIL(sh, "Failed generating/uploading scaling LUTs!"); + return false; + } + } + } + + // Done updating LUTs + obj->data = params->data; + obj->repr = *params->repr; + obj->fg_has_y = fg_has_y; + obj->fg_has_u = fg_has_u; + obj->fg_has_v = fg_has_v; + + sh_describe(sh, "AV1 film grain"); + GLSL("vec4 color; \n" + "// pl_shader_film_grain (AV1) \n" + "{ \n" + "uvec2 offset; \n" + "uvec2 pos; \n" + "float val; \n" + "float grain; \n"); + + if (is_compute) { + GLSL("uvec2 block_id = gl_WorkGroupID.xy; \n" + "uvec2 local_id = gl_LocalInvocationID.xy; \n" + "uvec2 global_id = gl_GlobalInvocationID.xy; \n"); + } else { + GLSL("uvec2 global_id = uvec2(gl_FragCoord); \n" + "uvec2 block_id = global_id / uvec2(%d, %d); \n" + "uvec2 local_id = global_id - uvec2(%d, %d) * block_id; \n", + bw, bh, bw, bh); + } + + // Load the data vector which holds the offsets + if (is_compute) { + ident_t id = sh_fresh(sh, "data"); + GLSLH("shared uint "$"; \n", id); + GLSL("if (gl_LocalInvocationIndex == 0u) \n" + " "$" = uint("$"(block_id)); \n" + "barrier(); \n" + "uint data = "$"; \n", + id, offsets, id); + } else { + GLSL("uint data = uint("$"(block_id)); \n", offsets); + } + + struct grain_scale scale = get_grain_scale(params); + pl_color_repr_normalize(params->repr); + int bits = PL_DEF(params->repr->bits.color_depth, 8); + pl_assert(bits >= 8); + + ident_t minValue, maxLuma, maxChroma; + if (pl_color_levels_guess(params->repr) == PL_COLOR_LEVELS_LIMITED) { + float out_scale = (1 << bits) / ((1 << bits) - 1.0); + minValue = SH_FLOAT(16 / 256.0 * out_scale); + maxLuma = SH_FLOAT(235 / 256.0 * out_scale); + maxChroma = SH_FLOAT(240 / 256.0 * out_scale); + if (!pl_color_system_is_ycbcr_like(params->repr->sys)) + maxChroma = maxLuma; + } else { + minValue = SH_FLOAT(0.0); + maxLuma = SH_FLOAT(1.0); + maxChroma = SH_FLOAT(1.0); + } + + // Load the color value of the tex itself + ident_t tex = sh_desc(sh, (struct 
pl_shader_desc) { + .binding.object = params->tex, + .desc = (struct pl_desc) { + .name = "tex", + .type = PL_DESC_SAMPLED_TEX, + }, + }); + + ident_t tex_scale = SH_FLOAT(scale.texture_scale); + GLSL("color = vec4("$") * texelFetch("$", ivec2(global_id), 0); \n", + tex_scale, tex); + + // If we need access to the external luma plane, load it now + if (tex_is_cb || tex_is_cr) { + GLSL("float averageLuma; \n"); + if (tex_is_y) { + // We already have the luma channel as part of the pre-sampled color + for (int i = 0; i < 3; i++) { + if (channel_map(i, params) == PL_CHANNEL_Y) { + GLSL("averageLuma = color["$"]; \n", SH_INT(i)); + break; + } + } + } else { + // Luma channel not present in image, attach it separately + pl_assert(params->luma_tex); + ident_t luma = sh_desc(sh, (struct pl_shader_desc) { + .binding.object = params->luma_tex, + .desc = (struct pl_desc) { + .name = "luma", + .type = PL_DESC_SAMPLED_TEX, + }, + }); + + GLSL("pos = global_id * uvec2(%du, %du); \n" + "averageLuma = texelFetch("$", ivec2(pos), 0)["$"]; \n" + "averageLuma *= "$"; \n", + 1 << sub_x, 1 << sub_y, + luma, SH_INT(params->luma_comp), + tex_scale); + } + } + + ident_t grain_min = SH_FLOAT(scale.grain_min * scale.grain_scale); + ident_t grain_max = SH_FLOAT(scale.grain_max * scale.grain_scale); + + for (int i = 0; i < params->components; i++) { + enum pl_channel c = channel_map(i, params); + if (c == PL_CHANNEL_NONE) + continue; + if (!scaling[c]) + continue; + + sample(sh, OFFSET_N, lut[c], idx[c], sub_x, sub_y); + GLSL("grain = val; \n"); + + if (data->overlap) { + const char *weights[] = { "vec2(27.0, 17.0)", "vec2(23.0, 22.0)" }; + + // X-direction overlapping + GLSL("if (block_id.x > 0u && local_id.x < %du) { \n" + "vec2 w = %s / 32.0; \n" + "if (local_id.x == 1u) w.xy = w.yx; \n", + 2 >> sub_x, weights[sub_x]); + sample(sh, OFFSET_L, lut[c], idx[c], sub_x, sub_y); + GLSL("grain = dot(vec2(val, grain), w); \n" + "} \n"); + + // Y-direction overlapping + GLSL("if (block_id.y > 0u && local_id.y < %du) { \n" + "vec2 w = %s / 32.0; \n" + "if (local_id.y == 1u) w.xy = w.yx; \n", + 2 >> sub_y, weights[sub_y]); + + // We need to special-case the top left pixels since these need to + // pre-blend the top-left offset block before blending vertically + GLSL(" if (block_id.x > 0u && local_id.x < %du) {\n" + " vec2 w2 = %s / 32.0; \n" + " if (local_id.x == 1u) w2.xy = w2.yx; \n", + 2 >> sub_x, weights[sub_x]); + sample(sh, OFFSET_TL, lut[c], idx[c], sub_x, sub_y); + GLSL(" float tmp = val; \n"); + sample(sh, OFFSET_T, lut[c], idx[c], sub_x, sub_y); + GLSL(" val = dot(vec2(tmp, val), w2); \n" + " } else { \n"); + sample(sh, OFFSET_T, lut[c], idx[c], sub_x, sub_y); + GLSL(" } \n" + "grain = dot(vec2(val, grain), w); \n" + "} \n"); + + // Correctly clip the interpolated grain + GLSL("grain = clamp(grain, "$", "$"); \n", grain_min, grain_max); + } + + if (c == PL_CHANNEL_Y) { + GLSL("color[%d] += "$"(color[%d]) * grain; \n" + "color[%d] = clamp(color[%d], "$", "$"); \n", + i, scaling[c], i, + i, i, minValue, maxLuma); + } else { + GLSL("val = averageLuma; \n"); + if (!data->chroma_scaling_from_luma) { + // We need to load some extra variables for the mixing. Do this + // using sh_var instead of hard-coding them to avoid shader + // recompilation when these values change. 
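+ // (The multipliers are applied in units of 1/64, and the chroma offset
+ // is shifted up to the effective bit depth and then normalized by
+ // grain_scale.)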
+ ident_t mult = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_vec2("mult"), + .data = &(float[2]){ + data->uv_mult_luma[c - 1] / 64.0, + data->uv_mult[c - 1] / 64.0, + }, + }); + + int c_offset = (unsigned) data->uv_offset[c - 1] << (bits - 8); + ident_t offset = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_float("offset"), + .data = &(float) { c_offset * scale.grain_scale }, + }); + + GLSL("val = dot(vec2(val, color[%d]), "$"); \n" + "val += "$"; \n", + i, mult, offset); + } + GLSL("color[%d] += "$"(val) * grain; \n" + "color[%d] = clamp(color[%d], "$", "$"); \n", + i, scaling[c], + i, i, minValue, maxChroma); + } + } + + GLSL("} \n"); + return true; +} diff --git a/src/shaders/film_grain_h274.c b/src/shaders/film_grain_h274.c new file mode 100644 index 0000000..6d524da --- /dev/null +++ b/src/shaders/film_grain_h274.c @@ -0,0 +1,815 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "shaders.h" +#include "shaders/film_grain.h" + +static const int8_t Gaussian_LUT[2048+4]; +static const uint32_t Seed_LUT[256]; +static const int8_t R64T[64][64]; + +static void prng_shift(uint32_t *state) +{ + // Primitive polynomial x^31 + x^3 + 1 (modulo 2) + uint32_t x = *state; + uint8_t feedback = 1u ^ (x >> 2) ^ (x >> 30); + *state = (x << 1) | (feedback & 1u); +} + + +static void generate_slice(float *out, size_t out_width, uint8_t h, uint8_t v, + int8_t grain[64][64], int16_t tmp[64][64]) +{ + const uint8_t freq_h = ((h + 3) << 2) - 1; + const uint8_t freq_v = ((v + 3) << 2) - 1; + uint32_t seed = Seed_LUT[h + v * 13]; + + // Initialize with random gaussian values, using the output array as a + // temporary buffer for these intermediate values. 
+ // + // Note: To make the subsequent matrix multiplication cache friendlier, we + // store each *column* of the starting image in a *row* of `grain` + for (int y = 0; y <= freq_v; y++) { + for (int x = 0; x <= freq_h; x += 4) { + uint16_t offset = seed % 2048; + grain[x + 0][y] = Gaussian_LUT[offset + 0]; + grain[x + 1][y] = Gaussian_LUT[offset + 1]; + grain[x + 2][y] = Gaussian_LUT[offset + 2]; + grain[x + 3][y] = Gaussian_LUT[offset + 3]; + prng_shift(&seed); + } + } + + grain[0][0] = 0; + + // 64x64 inverse integer transform + for (int y = 0; y < 64; y++) { + for (int x = 0; x <= freq_h; x++) { + int32_t sum = 0; + for (int p = 0; p <= freq_v; p++) + sum += R64T[y][p] * grain[x][p]; + tmp[y][x] = (sum + 128) >> 8; + } + } + + for (int y = 0; y < 64; y++) { + for (int x = 0; x < 64; x++) { + int32_t sum = 0; + for (int p = 0; p <= freq_h; p++) + sum += tmp[y][p] * R64T[x][p]; // R64T^T = R64 + sum = (sum + 128) >> 8; + grain[y][x] = PL_CLAMP(sum, -127, 127); + } + } + + static const uint8_t deblock_factors[13] = { + 64, 71, 77, 84, 90, 96, 103, 109, 116, 122, 128, 128, 128 + }; + + // Deblock horizontal edges by simple attenuation of values + const uint8_t deblock_coeff = deblock_factors[v]; + for (int y = 0; y < 64; y++) { + switch (y % 8) { + case 0: case 7: + // Deblock + for (int x = 0; x < 64; x++) + out[x] = ((grain[y][x] * deblock_coeff) >> 7) / 255.0; + break; + + case 1: case 2: + case 3: case 4: + case 5: case 6: + // No deblock + for (int x = 0; x < 64; x++) + out[x] = grain[y][x] / 255.0; + break; + + default: pl_unreachable(); + } + + out += out_width; + } +} + +static void fill_grain_lut(void *data, const struct sh_lut_params *params) +{ + struct { + int8_t grain[64][64]; + int16_t tmp[64][64]; + } *tmp = pl_alloc_ptr(NULL, tmp); + + float *out = data; + assert(params->var_type == PL_VAR_FLOAT); + + for (int h = 0; h < 13; h++) { + for (int v = 0; v < 13; v++) { + float *slice = out + (h * 64) * params->width + (v * 64); + generate_slice(slice, params->width, h, v, tmp->grain, tmp->tmp); + } + } + + pl_free(tmp); +} + +bool pl_needs_fg_h274(const struct pl_film_grain_params *params) +{ + const struct pl_h274_grain_data *data = &params->data.params.h274; + if (data->model_id != 0) + return false; + + for (int i = 0; i < 3; i++) { + enum pl_channel channel = channel_map(i, params); + if (channel < 0 || channel >= 3) + continue; + if (data->component_model_present[channel]) + return true; + } + + return false; +} + +bool pl_shader_fg_h274(pl_shader sh, pl_shader_obj *grain_state, + const struct pl_film_grain_params *params) +{ + if (!sh_require(sh, PL_SHADER_SIG_NONE, params->tex->params.w, params->tex->params.h)) + return false; + + size_t shmem_req = 0; + ident_t group_sum = NULL_IDENT; + + const struct pl_glsl_version glsl = sh_glsl(sh); + if (glsl.subgroup_size < 8*8) { + group_sum = sh_fresh(sh, "group_sum"); + shmem_req += sizeof(int); + GLSLH("shared int "$"; \n", group_sum); + GLSL($" = 0; barrier(); \n", group_sum); + } + + if (!sh_try_compute(sh, 8, 8, false, shmem_req)) { + SH_FAIL(sh, "H.274 film grain synthesis requires compute shaders!"); + return false; + } + + ident_t db = sh_lut(sh, sh_lut_params( + .object = grain_state, + .var_type = PL_VAR_FLOAT, + .lut_type = SH_LUT_TEXTURE, + .width = 13 * 64, + .height = 13 * 64, + .comps = 1, + .fill = fill_grain_lut, + .signature = CACHE_KEY_H274, // doesn't depend on anything + .cache = SH_CACHE(sh), + )); + + sh_describe(sh, "H.274 film grain"); + GLSL("vec4 color; \n" + "// pl_shader_film_grain (H.274) \n" + "{ \n");
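+ // The generated shader body below works roughly as follows, for each
+ // enabled component:
+ //  1. Fetch the source texel and normalize it according to the repr.
+ //  2. Average the component over the 8x8 workgroup - using subgroupAdd
+ //     where available, otherwise fixed-point atomicAdd on the shared
+ //     group_sum - and use that average to select an intensity interval,
+ //     which yields a packed (scale, h, v) model value.
+ //  3. Derive a pseudo-random offset from a PCG hash of the grain seed
+ //     and the 16x16 block position, fetch grain from the corresponding
+ //     64x64 slice of the 13x13 database generated above, and add
+ //     scale * grain to the output color.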
+ + // Load the color value of the tex itself + ident_t tex = sh_desc(sh, (struct pl_shader_desc) { + .binding.object = params->tex, + .desc = (struct pl_desc) { + .name = "tex", + .type = PL_DESC_SAMPLED_TEX, + }, + }); + + GLSL("ivec2 pos = ivec2(gl_GlobalInvocationID); \n" + "color = vec4("$") * texelFetch("$", pos, 0); \n", + SH_FLOAT(pl_color_repr_normalize(params->repr)), tex); + + const struct pl_h274_grain_data *data = &params->data.params.h274; + ident_t scale_factor = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_float("scale_factor"), + .data = &(float){ 1.0 / (1 << (data->log2_scale_factor + 6)) }, + }); + + // pcg3d (http://www.jcgt.org/published/0009/03/02/) + GLSL("uvec3 pcg = uvec3("$", gl_WorkGroupID.xy / 2u); \n" + "pcg = pcg * 1664525u + 1013904223u; \n" + "pcg.x += pcg.y * pcg.z; \n" + "pcg.y += pcg.z * pcg.x; \n" + "pcg.z += pcg.x * pcg.y; \n" + "pcg ^= pcg >> 16u; \n" + "pcg.x += pcg.y * pcg.z; \n" + "pcg.y += pcg.z * pcg.x; \n" + "pcg.z += pcg.x * pcg.y; \n", + sh_var(sh, (struct pl_shader_var) { + .var = pl_var_uint("seed"), + .data = &(unsigned int){ params->data.seed }, + })); + + for (int idx = 0; idx < params->components; idx++) { + enum pl_channel c = channel_map(idx, params); + if (c == PL_CHANNEL_NONE) + continue; + if (!data->component_model_present[c]) + continue; + + GLSL("// component %d\n{\n", c); + + // Compute the local 8x8 average + GLSL("float avg = color[%d] / 64.0; \n", c); + + const int precision = 10000000; + if (glsl.subgroup_size) { + GLSL("avg = subgroupAdd(avg); \n"); + + if (glsl.subgroup_size < 8*8) { + GLSL("if (subgroupElect()) \n" + " atomicAdd("$", int(avg * %d.0)); \n" + "barrier(); \n" + "avg = float("$") / %d.0; \n", + group_sum, precision, group_sum, precision); + } + } else { + GLSL("atomicAdd("$", int(avg * %d.0)); \n" + "barrier(); \n" + "avg = float("$") / %d.0; \n", + group_sum, precision, group_sum, precision); + } + + // Hard-coded unrolled loop, to avoid having to load a dynamically + // sized array into the shader - and to optimize for the very common + // case of there only being a single intensity interval + GLSL("uint val; \n"); + for (int i = 0; i < data->num_intensity_intervals[c]; i++) { + ident_t bounds = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_vec2("bounds"), + .data = &(float[2]) { + data->intensity_interval_lower_bound[c][i] / 255.0, + data->intensity_interval_upper_bound[c][i] / 255.0, + }, + }); + + const uint8_t num_values = data->num_model_values[c]; + uint8_t h = num_values > 1 ? data->comp_model_value[c][i][1] : 8; + uint8_t v = num_values > 2 ? data->comp_model_value[c][i][2] : h; + h = PL_CLAMP(h, 2, 14) - 2; + v = PL_CLAMP(v, 2, 14) - 2; + // FIXME: double h/v for subsampled planes!
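+ // The selected model values are packed into a single uint below:
+ // bits 31..16 carry the scaling factor, bits 15..8 the horizontal
+ // cutoff h and bits 7..0 the vertical cutoff v. The shader recovers
+ // h * 64 and v * 64 from it ((val & 0xFF00u) >> 2 == h << 6,
+ // (val & 0xFFu) << 6 == v << 6), which locate the matching 64x64
+ // slice inside the grain database.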
+ + // Reduce scale for chroma planes + int16_t scale = data->comp_model_value[c][i][0]; + if (c > 0 && pl_color_system_is_ycbcr_like(params->repr->sys)) + scale >>= 1; + + pl_static_assert(sizeof(unsigned int) >= sizeof(uint32_t)); + ident_t values = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_uint("comp_model_value"), + .data = &(unsigned int) { + (uint16_t) scale << 16 | h << 8 | v, + }, + }); + + GLSL("if (avg >= "$".x && avg <= "$".y) \n" + " val = "$"; else \n", + bounds, bounds, values); + } + GLSL(" val = 0u; \n"); + + // Extract the grain parameters from comp_model_value + GLSL("uvec2 offset = uvec2((val & 0xFF00u) >> 2, \n" + " (val & 0xFFu) << 6); \n" + "float scale = "$" * float(int(val >> 16)); \n" + // Add randomness + "uint rand = pcg[%d]; \n" + "offset.x += (rand >> 16u) %% 52u; \n" + "offset.y += (rand & 0xFFFFu) %% 56u; \n" + "offset.x &= 0xFFFCu; \n" + "offset.y &= 0xFFF8u; \n" + "if ((rand & 1u) == 1u) scale = -scale; \n" + // Add local offset and compute grain + "offset += 8u * (gl_WorkGroupID.xy %% 2u); \n" + "offset += gl_LocalInvocationID.xy; \n" + "float grain = "$"(offset); \n" + "color[%d] += scale * grain; \n", + scale_factor, c, db, c); + + // TODO: Deblocking? + + GLSL("}\n"); + } + + GLSL("} \n"); + return true; +} + +// These tables are all taken from the SMPTE RDD 5-2006 specification +static const int8_t Gaussian_LUT[2048+4] = { + -11, 12, 103, -11, 42, -35, 12, 59, 77, 98, -87, 3, 65, -78, 45, 56, -51, 21, + 13, -11, -20, -19, 33, -127, 17, -6, -105, 18, 19, 71, 48, -10, -38, 42, + -2, 75, -67, 52, -90, 33, -47, 21, -3, -56, 49, 1, -57, -42, -1, 120, -127, + -108, -49, 9, 14, 127, 122, 109, 52, 127, 2, 7, 114, 19, 30, 12, 77, 112, + 82, -61, -127, 111, -52, -29, 2, -49, -24, 58, -29, -73, 12, 112, 67, 79, + -3, -114, -87, -6, -5, 40, 58, -81, 49, -27, -31, -34, -105, 50, 16, -24, + -35, -14, -15, -127, -55, -22, -55, -127, -112, 5, -26, -72, 127, 127, -2, + 41, 87, -65, -16, 55, 19, 91, -81, -65, -64, 35, -7, -54, 99, -7, 88, 125, + -26, 91, 0, 63, 60, -14, -23, 113, -33, 116, 14, 26, 51, -16, 107, -8, 53, + 38, -34, 17, -7, 4, -91, 6, 63, 63, -15, 39, -36, 19, 55, 17, -51, 40, 33, + -37, 126, -39, -118, 17, -30, 0, 19, 98, 60, 101, -12, -73, -17, -52, 98, + 3, 3, 60, 33, -3, -2, 10, -42, -106, -38, 14, 127, 16, -127, -31, -86, -39, + -56, 46, -41, 75, 23, -19, -22, -70, 74, -54, -2, 32, -45, 17, -92, 59, + -64, -67, 56, -102, -29, -87, -34, -92, 68, 5, -74, -61, 93, -43, 14, -26, + -38, -126, -17, 16, -127, 64, 34, 31, 93, 17, -51, -59, 71, 77, 81, 127, + 127, 61, 33, -106, -93, 0, 0, 75, -69, 71, 127, -19, -111, 30, 23, 15, 2, + 39, 92, 5, 42, 2, -6, 38, 15, 114, -30, -37, 50, 44, 106, 27, 119, 7, -80, + 25, -68, -21, 92, -11, -1, 18, 41, -50, 79, -127, -43, 127, 18, 11, -21, + 32, -52, 27, -88, -90, -39, -19, -10, 24, -118, 72, -24, -44, 2, 12, 86, + -107, 39, -33, -127, 47, 51, -24, -22, 46, 0, 15, -35, -69, -2, -74, 24, + -6, 0, 29, -3, 45, 32, -32, 117, -45, 79, -24, -17, -109, -10, -70, 88, + -48, 24, -91, 120, -37, 50, -127, 58, 32, -82, -10, -17, -7, 46, -127, -15, + 89, 127, 17, 98, -39, -33, 37, 42, -40, -32, -21, 105, -19, 19, 19, -59, + -9, 30, 0, -127, 34, 127, -84, 75, 24, -40, -49, -127, -107, -14, 45, -75, + 1, 30, -20, 41, -68, -40, 12, 127, -3, 5, 20, -73, -59, -127, -3, -3, -53, + -6, -119, 93, 120, -80, -50, 0, 20, -46, 67, 78, -12, -22, -127, 36, -41, + 56, 119, -5, -116, -22, 68, -14, -90, 24, -82, -44, -127, 107, -25, -37, + 40, -7, -7, -82, 5, -87, 44, -34, 9, -127, 39, 70, 49, -63, 74, -49, 109, + -27, -89, -47, -39, 44, 
49, -4, 60, -42, 80, 9, -127, -9, -56, -49, 125, + -66, 47, 36, 117, 15, -11, -96, 109, 94, -17, -56, 70, 8, -14, -5, 50, 37, + -45, 120, -30, -76, 40, -46, 6, 3, 69, 17, -78, 1, -79, 6, 127, 43, 26, + 127, -127, 28, -55, -26, 55, 112, 48, 107, -1, -77, -1, 53, -9, -22, -43, + 123, 108, 127, 102, 68, 46, 5, 1, 123, -13, -55, -34, -49, 89, 65, -105, + -5, 94, -53, 62, 45, 30, 46, 18, -35, 15, 41, 47, -98, -24, 94, -75, 127, + -114, 127, -68, 1, -17, 51, -95, 47, 12, 34, -45, -75, 89, -107, -9, -58, + -29, -109, -24, 127, -61, -13, 77, -45, 17, 19, 83, -24, 9, 127, -66, 54, + 4, 26, 13, 111, 43, -113, -22, 10, -24, 83, 67, -14, 75, -123, 59, 127, + -12, 99, -19, 64, -38, 54, 9, 7, 61, -56, 3, -57, 113, -104, -59, 3, -9, + -47, 74, 85, -55, -34, 12, 118, 28, 93, -72, 13, -99, -72, -20, 30, 72, + -94, 19, -54, 64, -12, -63, -25, 65, 72, -10, 127, 0, -127, 103, -20, -73, + -112, -103, -6, 28, -42, -21, -59, -29, -26, 19, -4, -51, 94, -58, -95, + -37, 35, 20, -69, 127, -19, -127, -22, -120, -53, 37, 74, -127, -1, -12, + -119, -53, -28, 38, 69, 17, 16, -114, 89, 62, 24, 37, -23, 49, -101, -32, + -9, -95, -53, 5, 93, -23, -49, -8, 51, 3, -75, -90, -10, -39, 127, -86, + -22, 20, 20, 113, 75, 52, -31, 92, -63, 7, -12, 46, 36, 101, -43, -17, -53, + -7, -38, -76, -31, -21, 62, 31, 62, 20, -127, 31, 64, 36, 102, -85, -10, + 77, 80, 58, -79, -8, 35, 8, 80, -24, -9, 3, -17, 72, 127, 83, -87, 55, 18, + -119, -123, 36, 10, 127, 56, -55, 113, 13, 26, 32, -13, -48, 22, -13, 5, + 58, 27, 24, 26, -11, -36, 37, -92, 78, 81, 9, 51, 14, 67, -13, 0, 32, 45, + -76, 32, -39, -22, -49, -127, -27, 31, -9, 36, 14, 71, 13, 57, 12, -53, + -86, 53, -44, -35, 2, 127, 12, -66, -44, 46, -115, 3, 10, 56, -35, 119, + -19, -61, 52, -59, -127, -49, -23, 4, -5, 17, -82, -6, 127, 25, 79, 67, 64, + -25, 14, -64, -37, -127, -28, 21, -63, 66, -53, -41, 109, -62, 15, -22, 13, + 29, -63, 20, 27, 95, -44, -59, -116, -10, 79, -49, 22, -43, -16, 46, -47, + -120, -36, -29, -52, -44, 29, 127, -13, 49, -9, -127, 75, -28, -23, 88, 59, + 11, -95, 81, -59, 58, 60, -26, 40, -92, -3, -22, -58, -45, -59, -22, -53, + 71, -29, 66, -32, -23, 14, -17, -66, -24, -28, -62, 47, 38, 17, 16, -37, + -24, -11, 8, -27, -19, 59, 45, -49, -47, -4, -22, -81, 30, -67, -127, 74, + 102, 5, -18, 98, 34, -66, 42, -52, 7, -59, 24, -58, -19, -24, -118, -73, + 91, 15, -16, 79, -32, -79, -127, -36, 41, 77, -83, 2, 56, 22, -75, 127, + -16, -21, 12, 31, 56, -113, -127, 90, 55, 61, 12, 55, -14, -113, -14, 32, + 49, -67, -17, 91, -10, 1, 21, 69, -70, 99, -19, -112, 66, -90, -10, -9, + -71, 127, 50, -81, -49, 24, 61, -61, -111, 7, -41, 127, 88, -66, 108, -127, + -6, 36, -14, 41, -50, 14, 14, 73, -101, -28, 77, 127, -8, -100, 88, 38, + 121, 88, -125, -60, 13, -94, -115, 20, -67, -87, -94, -119, 44, -28, -30, + 18, 5, -53, -61, 20, -43, 11, -77, -60, 13, 29, 3, 6, -72, 38, -60, -11, + 108, -53, 41, 66, -12, -127, -127, -49, 24, 29, 46, 36, 91, 34, -33, 116, + -51, -34, -52, 91, 7, -83, 73, -26, -103, 24, -10, 76, 84, 5, 68, -80, -13, + -17, -32, -48, 20, 50, 26, 10, 63, -104, -14, 37, 127, 114, 97, 35, 1, -33, + -55, 127, -124, -33, 61, -7, 119, -32, -127, -53, -42, 63, 3, -5, -26, 70, + -58, -33, -44, -43, 34, -56, -127, 127, 25, -35, -11, 16, -81, 29, -58, 40, + -127, -127, 20, -47, -11, -36, -63, -52, -32, -82, 78, -76, -73, 8, 27, + -72, -9, -74, -85, -86, -57, 25, 78, -10, -97, 35, -65, 8, -59, 14, 1, -42, + 32, -88, -44, 17, -3, -9, 59, 40, 12, -108, -40, 24, 34, 18, -28, 2, 51, + -110, -4, 100, 1, 65, 22, 0, 127, 61, 45, 25, -31, 6, 9, -7, -48, 99, 16, + 44, -2, -40, 32, 
-39, -52, 10, -110, -19, 56, -127, 69, 26, 51, 92, 40, 61, + -52, 45, -38, 13, 85, 122, 27, 66, 45, -111, -83, -3, 31, 37, 19, -36, 58, + 71, 39, -78, -47, 58, -78, 8, -62, -36, -14, 61, 42, -127, 71, -4, 24, -54, + 52, -127, 67, -4, -42, 30, -63, 59, -3, -1, -18, -46, -92, -81, -96, -14, + -53, -10, -11, -77, 13, 1, 8, -67, -127, 127, -28, 26, -14, 18, -13, -26, + 2, 10, -46, -32, -15, 27, -31, -59, 59, 77, -121, 28, 40, -54, -62, -31, + -21, -37, -32, -6, -127, -25, -60, 70, -127, 112, -127, 127, 88, -7, 116, + 110, 53, 87, -127, 3, 16, 23, 74, -106, -51, 3, 74, -82, -112, -74, 65, 81, + 25, 53, 127, -45, -50, -103, -41, -65, -29, 79, -67, 64, -33, -30, -8, 127, + 0, -13, -51, 67, -14, 5, -92, 29, -35, -8, -90, -57, -3, 36, 43, 44, -31, + -69, -7, 36, 39, -51, 43, -81, 58, 6, 127, 12, 57, 66, 46, 59, -43, -42, + 41, -15, -120, 24, 3, -11, 19, -13, 51, 28, 3, 55, -48, -12, -1, 2, 97, + -19, 29, 42, 13, 43, 78, -44, 56, -108, -43, -19, 127, 15, -11, -18, -81, + 83, -37, 77, -109, 15, 65, -50, 43, 12, 13, 27, 28, 61, 57, 30, 26, 106, + -18, 56, 13, 97, 4, -8, -62, -103, 94, 108, -44, 52, 27, -47, -9, 105, -53, + 46, 89, 103, -33, 38, -34, 55, 51, 70, -94, -35, -87, -107, -19, -31, 9, + -19, 79, -14, 77, 5, -19, -107, 85, 21, -45, -39, -42, 9, -29, 74, 47, -75, + 60, -127, 120, -112, -57, -32, 41, 7, 79, 76, 66, 57, 41, -25, 31, 37, -47, + -36, 43, -73, -37, 63, 127, -69, -52, 90, -33, -61, 60, -55, 44, 15, 4, + -67, 13, -92, 64, 29, -39, -3, 83, -2, -38, -85, -86, 58, 35, -69, -61, 29, + -37, -95, -78, 4, 30, -4, -32, -80, -22, -9, -77, 46, 7, -93, -71, 65, 9, + -50, 127, -70, 26, -12, -39, -114, 63, -127, -100, 4, -32, 111, 22, -60, + 65, -101, 26, -42, 21, -59, -27, -74, 2, -94, 6, 126, 5, 76, -88, -9, -43, + -101, 127, 1, 125, 92, -63, 52, 56, 4, 81, -127, 127, 80, 127, -29, 30, + 116, -74, -17, -57, 105, 48, 45, 25, -72, 48, -38, -108, 31, -34, 4, -11, + 41, -127, 52, -104, -43, -37, 52, 2, 47, 87, -9, 77, 27, -41, -25, 90, 86, + -56, 75, 10, 33, 78, 58, 127, 127, -7, -73, 49, -33, -106, -35, 38, 57, 53, + -17, -4, 83, 52, -108, 54, -125, 28, 23, 56, -43, -88, -17, -6, 47, 23, -9, + 0, -13, 111, 75, 27, -52, -38, -34, 39, 30, 66, 39, 38, -64, 38, 3, 21, + -32, -51, -28, 54, -38, -87, 20, 52, 115, 18, -81, -70, 0, -14, -46, -46, + -3, 125, 16, -14, 23, -82, -84, -69, -20, -65, -127, 9, 81, -49, 61, 7, + -36, -45, -42, 57, -26, 47, 20, -85, 46, -13, 41, -37, -75, -60, 86, -78, + -127, 12, 50, 2, -3, 13, 47, 5, 19, -78, -55, -27, 65, -71, 12, -108, 20, + -16, 11, -31, 63, -55, 37, 75, -17, 127, -73, -33, -28, -120, 105, 68, 106, + -103, -106, 71, 61, 2, 23, -3, 33, -5, -15, -67, -15, -23, -54, 15, -63, + 76, 58, -110, 1, 83, -27, 22, 75, -39, -17, -11, 64, -17, -127, -54, -66, + 31, 96, 116, 3, -114, -7, -108, -63, 97, 9, 50, 8, 75, -28, 72, 112, -36, + -112, 95, -50, 23, -13, -19, 55, 21, 23, 92, 91, 22, -49, 16, -75, 23, 9, + -49, -97, -37, 49, -36, 36, -127, -86, 43, 127, -24, -24, 84, 83, -35, -34, + -12, 109, 102, -38, 51, -68, 34, 19, -22, 49, -32, 127, 40, 24, -93, -4, + -3, 105, 3, -58, -18, 8, 127, -18, 125, 68, 69, -62, 30, -36, 54, -57, -24, + 17, 43, -36, -27, -57, -67, -21, -10, -49, 68, 12, 65, 4, 48, 55, 127, -75, + 44, 89, -66, -13, -78, -82, -91, 22, 30, 33, -40, -87, -34, 96, -91, 39, + 10, -64, -3, -12, 127, -50, -37, -56, 23, -35, -36, -54, 90, -91, 2, 50, + 77, -6, -127, 16, 46, -5, -73, 0, -56, -18, -72, 28, 93, 60, 49, 20, 18, + 111, -111, 32, -83, 47, 47, -10, 35, -88, 43, 57, -98, 127, -17, 0, 1, -39, + -127, -2, 0, 63, 93, 0, 36, -66, -61, -19, 39, -127, 58, 50, 
-17, 127, 88, + -43, -108, -51, -16, 7, -36, 68, 46, -14, 107, 40, 57, 7, 19, 8, 3, 88, + -90, -92, -18, -21, -24, 13, 7, -4, -78, -91, -4, 8, -35, -5, 19, 2, -111, + 4, -66, -81, 122, -20, -34, -37, -84, 127, 68, 46, 17, 47, + + // Repeat the beginning of the array to allow wrapping reads + -11, 12, 103, -11, +}; + +static const uint32_t Seed_LUT[256] = { + 747538460, 1088979410, 1744950180, 1767011913, 1403382928, + 521866116, 1060417601, 2110622736, 1557184770, 105289385, 585624216, + 1827676546, 1191843873, 1018104344, 1123590530, 663361569, 2023850500, + 76561770, 1226763489, 80325252, 1992581442, 502705249, 740409860, + 516219202, 557974537, 1883843076, 720112066, 1640137737, 1820967556, + 40667586, 155354121, 1820967557, 1115949072, 1631803309, 98284748, + 287433856, 2119719977, 988742797, 1827432592, 579378475, 1017745956, + 1309377032, 1316535465, 2074315269, 1923385360, 209722667, 1546228260, + 168102420, 135274561, 355958469, 248291472, 2127839491, 146920100, + 585982612, 1611702337, 696506029, 1386498192, 1258072451, 1212240548, + 1043171860, 1217404993, 1090770605, 1386498193, 169093201, 541098240, + 1468005469, 456510673, 1578687785, 1838217424, 2010752065, 2089828354, + 1362717428, 970073673, 854129835, 714793201, 1266069081, 1047060864, + 1991471829, 1098097741, 913883585, 1669598224, 1337918685, 1219264706, + 1799741108, 1834116681, 683417731, 1120274457, 1073098457, 1648396544, + 176642749, 31171789, 718317889, 1266977808, 1400892508, 549749008, + 1808010512, 67112961, 1005669825, 903663673, 1771104465, 1277749632, + 1229754427, 950632997, 1979371465, 2074373264, 305357524, 1049387408, + 1171033360, 1686114305, 2147468765, 1941195985, 117709841, 809550080, + 991480851, 1816248997, 1561503561, 329575568, 780651196, 1659144592, + 1910793616, 604016641, 1665084765, 1530186961, 1870928913, 809550081, + 2079346113, 71307521, 876663040, 1073807360, 832356664, 1573927377, + 204073344, 2026918147, 1702476788, 2043881033, 57949587, 2001393952, + 1197426649, 1186508931, 332056865, 950043140, 890043474, 349099312, + 148914948, 236204097, 2022643605, 1441981517, 498130129, 1443421481, + 924216797, 1817491777, 1913146664, 1411989632, 929068432, 495735097, + 1684636033, 1284520017, 432816184, 1344884865, 210843729, 676364544, + 234449232, 12112337, 1350619139, 1753272996, 2037118872, 1408560528, + 533334916, 1043640385, 357326099, 201376421, 110375493, 541106497, + 416159637, 242512193, 777294080, 1614872576, 1535546636, 870600145, + 910810409, 1821440209, 1605432464, 1145147393, 951695441, 1758494976, + 1506656568, 1557150160, 608221521, 1073840384, 217672017, 684818688, + 1750138880, 16777217, 677990609, 953274371, 1770050213, 1359128393, + 1797602707, 1984616737, 1865815816, 2120835200, 2051677060, 1772234061, + 1579794881, 1652821009, 1742099468, 1887260865, 46468113, 1011925248, + 1134107920, 881643832, 1354774993, 472508800, 1892499769, 1752793472, + 1962502272, 687898625, 883538000, 1354355153, 1761673473, 944820481, + 2020102353, 22020353, 961597696, 1342242816, 964808962, 1355809701, + 17016649, 1386540177, 647682692, 1849012289, 751668241, 1557184768, + 127374604, 1927564752, 1045744913, 1614921984, 43588881, 1016185088, + 1544617984, 1090519041, 136122424, 215038417, 1563027841, 2026918145, + 1688778833, 701530369, 1372639488, 1342242817, 2036945104, 953274369, + 1750192384, 16842753, 964808960, 1359020032, 1358954497 +}; + +// Note: This is pre-transposed, i.e. 
stored column-major order +static const int8_t R64T[64][64] = { + { + 32, 45, 45, 45, 45, 45, 45, 45, 44, 44, 44, 44, 43, 43, 43, 42, + 42, 41, 41, 40, 40, 39, 39, 38, 38, 37, 36, 36, 35, 34, 34, 33, + 32, 31, 30, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, + 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 4, 3, 2, 1, + }, { + 32, 45, 45, 44, 43, 42, 41, 39, 38, 36, 34, 31, 29, 26, 23, 20, + 17, 14, 11, 8, 4, 1, -2, -6, -9, -12, -15, -18, -21, -24, -27, -30, + -32, -34, -36, -38, -40, -41, -43, -44, -44, -45, -45, -45, -45, -45, -44, -43, + -42, -40, -39, -37, -35, -33, -30, -28, -25, -22, -19, -16, -13, -10, -7, -3, + }, { + 32, 45, 44, 42, 40, 37, 34, 30, 25, 20, 15, 10, 4, -1, -7, -12, + -17, -22, -27, -31, -35, -38, -41, -43, -44, -45, -45, -45, -43, -41, -39, -36, + -32, -28, -23, -18, -13, -8, -2, 3, 9, 14, 19, 24, 29, 33, 36, 39, + 42, 44, 45, 45, 45, 44, 43, 40, 38, 34, 30, 26, 21, 16, 11, 6, + }, { + 32, 45, 43, 39, 35, 30, 23, 16, 9, 1, -7, -14, -21, -28, -34, -38, + -42, -44, -45, -45, -43, -40, -36, -31, -25, -18, -11, -3, 4, 12, 19, 26, + 32, 37, 41, 44, 45, 45, 44, 41, 38, 33, 27, 20, 13, 6, -2, -10, + -17, -24, -30, -36, -40, -43, -45, -45, -44, -42, -39, -34, -29, -22, -15, -8, + }, { + 32, 44, 41, 36, 29, 20, 11, 1, -9, -18, -27, -34, -40, -44, -45, -45, + -42, -37, -30, -22, -13, -3, 7, 16, 25, 33, 39, 43, 45, 45, 43, 38, + 32, 24, 15, 6, -4, -14, -23, -31, -38, -42, -45, -45, -43, -39, -34, -26, + -17, -8, 2, 12, 21, 30, 36, 41, 44, 45, 44, 40, 35, 28, 19, 10, + }, { + 32, 44, 39, 31, 21, 10, -2, -14, -25, -34, -41, -45, -45, -42, -36, -28, + -17, -6, 7, 18, 29, 37, 43, 45, 44, 40, 34, 24, 13, 1, -11, -22, + -32, -39, -44, -45, -43, -38, -30, -20, -9, 3, 15, 26, 35, 41, 45, 45, + 42, 36, 27, 16, 4, -8, -19, -30, -38, -43, -45, -44, -40, -33, -23, -12, + }, { + 32, 43, 36, 26, 13, -1, -15, -28, -38, -44, -45, -42, -35, -24, -11, 3, + 17, 30, 39, 44, 45, 41, 34, 22, 9, -6, -19, -31, -40, -45, -45, -40, + -32, -20, -7, 8, 21, 33, 41, 45, 44, 39, 30, 18, 4, -10, -23, -34, + -42, -45, -44, -38, -29, -16, -2, 12, 25, 36, 43, 45, 43, 37, 27, 14, + }, { + 32, 42, 34, 20, 4, -12, -27, -38, -44, -45, -39, -28, -13, 3, 19, 33, + 42, 45, 43, 34, 21, 6, -11, -26, -38, -44, -45, -39, -29, -14, 2, 18, + 32, 41, 45, 43, 35, 22, 7, -10, -25, -37, -44, -45, -40, -30, -15, 1, + 17, 31, 41, 45, 43, 36, 23, 8, -9, -24, -36, -44, -45, -40, -30, -16, + }, { + 32, 41, 30, 14, -4, -22, -36, -44, -44, -37, -23, -6, 13, 30, 41, 45, + 42, 31, 15, -3, -21, -36, -44, -45, -38, -24, -7, 12, 29, 40, 45, 42, + 32, 16, -2, -20, -35, -44, -45, -38, -25, -8, 11, 28, 40, 45, 43, 33, + 17, -1, -19, -34, -43, -45, -39, -26, -9, 10, 27, 39, 45, 43, 34, 18, + }, { + 32, 40, 27, 8, -13, -31, -43, -45, -38, -22, -2, 18, 35, 44, 44, 34, + 17, -3, -23, -38, -45, -42, -30, -12, 9, 28, 41, 45, 40, 26, 7, -14, + -32, -43, -45, -37, -21, -1, 19, 36, 44, 44, 34, 16, -4, -24, -39, -45, + -42, -30, -11, 10, 29, 41, 45, 39, 25, 6, -15, -33, -43, -45, -36, -20, + }, { + 32, 39, 23, 1, -21, -38, -45, -40, -25, -3, 19, 37, 45, 41, 27, 6, + -17, -36, -45, -42, -29, -8, 15, 34, 44, 43, 30, 10, -13, -33, -44, -44, + -32, -12, 11, 31, 43, 44, 34, 14, -9, -30, -43, -45, -35, -16, 7, 28, + 42, 45, 36, 18, -4, -26, -41, -45, -38, -20, 2, 24, 40, 45, 39, 22, + }, { + 32, 38, 19, -6, -29, -43, -44, -31, -9, 16, 36, 45, 40, 22, -2, -26, + -42, -45, -34, -12, 13, 34, 45, 41, 25, 1, -23, -40, -45, -36, -15, 10, + 32, 44, 43, 28, 4, -20, -39, -45, -38, -18, 7, 30, 43, 44, 30, 8, + -17, -37, -45, -39, -21, 3, 27, 42, 44, 33, 11, -14, 
-35, -45, -41, -24, + }, { + 32, 37, 15, -12, -35, -45, -39, -18, 9, 33, 45, 40, 21, -6, -30, -44, + -42, -24, 2, 28, 43, 43, 27, 1, -25, -42, -44, -30, -4, 22, 41, 45, + 32, 8, -19, -39, -45, -34, -11, 16, 38, 45, 36, 14, -13, -36, -45, -38, + -17, 10, 34, 45, 40, 20, -7, -31, -44, -41, -23, 3, 29, 44, 43, 26, + }, { + 32, 36, 11, -18, -40, -45, -30, -3, 25, 43, 43, 24, -4, -31, -45, -39, + -17, 12, 36, 45, 35, 10, -19, -40, -44, -30, -2, 26, 43, 42, 23, -6, + -32, -45, -39, -16, 13, 37, 45, 34, 9, -20, -41, -44, -29, -1, 27, 44, + 42, 22, -7, -33, -45, -38, -15, 14, 38, 45, 34, 8, -21, -41, -44, -28, + }, { + 32, 34, 7, -24, -43, -41, -19, 12, 38, 45, 30, 1, -29, -45, -39, -14, + 17, 40, 44, 26, -4, -33, -45, -36, -9, 22, 43, 42, 21, -10, -36, -45, + -32, -3, 27, 44, 40, 16, -15, -39, -44, -28, 2, 31, 45, 37, 11, -20, + -42, -43, -23, 8, 35, 45, 34, 6, -25, -44, -41, -18, 13, 38, 45, 30, + }, { + 32, 33, 2, -30, -45, -36, -7, 26, 44, 38, 11, -22, -43, -40, -15, 18, + 42, 42, 19, -14, -40, -44, -23, 10, 38, 45, 27, -6, -35, -45, -30, 1, + 32, 45, 34, 3, -29, -45, -36, -8, 25, 44, 39, 12, -21, -43, -41, -16, + 17, 41, 43, 20, -13, -39, -44, -24, 9, 37, 45, 28, -4, -34, -45, -31, + }, { + 32, 31, -2, -34, -45, -28, 7, 37, 44, 24, -11, -39, -43, -20, 15, 41, + 42, 16, -19, -43, -40, -12, 23, 44, 38, 8, -27, -45, -35, -3, 30, 45, + 32, -1, -34, -45, -29, 6, 36, 45, 25, -10, -39, -44, -21, 14, 41, 42, + 17, -18, -43, -40, -13, 22, 44, 38, 9, -26, -45, -36, -4, 30, 45, 33, + }, { + 32, 30, -7, -38, -43, -18, 19, 44, 38, 6, -30, -45, -29, 8, 39, 43, + 17, -20, -44, -37, -4, 31, 45, 28, -9, -39, -43, -16, 21, 44, 36, 3, + -32, -45, -27, 10, 40, 42, 15, -22, -44, -36, -2, 33, 45, 26, -11, -40, + -42, -14, 23, 45, 35, 1, -34, -45, -25, 12, 41, 41, 13, -24, -45, -34, + }, { + 32, 28, -11, -41, -40, -8, 30, 45, 25, -14, -43, -38, -4, 33, 45, 22, + -17, -44, -36, -1, 35, 44, 19, -20, -44, -34, 2, 37, 43, 16, -23, -45, + -32, 6, 39, 42, 13, -26, -45, -30, 9, 40, 41, 10, -29, -45, -27, 12, + 42, 39, 7, -31, -45, -24, 15, 43, 38, 3, -34, -45, -21, 18, 44, 36, + }, { + 32, 26, -15, -44, -35, 3, 39, 41, 9, -31, -45, -20, 21, 45, 30, -10, + -42, -38, -2, 36, 43, 14, -27, -45, -25, 16, 44, 34, -4, -39, -41, -8, + 32, 45, 19, -22, -45, -30, 11, 42, 38, 1, -36, -43, -13, 28, 45, 24, + -17, -44, -34, 6, 40, 40, 7, -33, -44, -18, 23, 45, 29, -12, -43, -37, + }, { + 32, 24, -19, -45, -29, 14, 44, 33, -9, -42, -36, 3, 40, 39, 2, -37, + -42, -8, 34, 44, 13, -30, -45, -18, 25, 45, 23, -20, -45, -28, 15, 44, + 32, -10, -43, -36, 4, 40, 39, 1, -38, -41, -7, 34, 43, 12, -30, -45, + -17, 26, 45, 22, -21, -45, -27, 16, 44, 31, -11, -43, -35, 6, 41, 38, + }, { + 32, 22, -23, -45, -21, 24, 45, 20, -25, -45, -19, 26, 45, 18, -27, -45, + -17, 28, 45, 16, -29, -45, -15, 30, 44, 14, -30, -44, -13, 31, 44, 12, + -32, -44, -11, 33, 43, 10, -34, -43, -9, 34, 43, 8, -35, -42, -7, 36, + 42, 6, -36, -41, -4, 37, 41, 3, -38, -40, -2, 38, 40, 1, -39, -39, + }, { + 32, 20, -27, -45, -13, 33, 43, 6, -38, -39, 2, 41, 35, -10, -44, -30, + 17, 45, 23, -24, -45, -16, 30, 44, 9, -36, -41, -1, 40, 37, -7, -43, + -32, 14, 45, 26, -21, -45, -19, 28, 44, 12, -34, -42, -4, 38, 39, -3, + -42, -34, 11, 44, 29, -18, -45, -22, 25, 45, 15, -31, -43, -8, 36, 40, + }, { + 32, 18, -30, -43, -4, 39, 36, -10, -44, -26, 23, 45, 13, -34, -41, 1, + 42, 33, -15, -45, -21, 28, 44, 8, -38, -38, 7, 44, 29, -20, -45, -16, + 32, 42, 2, -40, -35, 12, 45, 24, -25, -45, -11, 36, 40, -3, -43, -31, + 17, 45, 19, -30, -43, -6, 39, 37, -9, -44, -27, 22, 45, 14, -34, -41, 
+ }, { + 32, 16, -34, -40, 4, 44, 27, -24, -44, -8, 39, 36, -13, -45, -19, 31, + 42, -1, -43, -30, 21, 45, 11, -37, -38, 10, 45, 22, -29, -43, -2, 41, + 32, -18, -45, -14, 35, 39, -7, -44, -25, 26, 44, 6, -40, -34, 15, 45, + 17, -33, -41, 3, 43, 28, -23, -45, -9, 38, 36, -12, -45, -20, 30, 42, + }, { + 32, 14, -36, -37, 13, 45, 15, -36, -38, 12, 45, 16, -35, -38, 11, 45, + 17, -34, -39, 10, 45, 18, -34, -39, 9, 45, 19, -33, -40, 8, 45, 20, + -32, -40, 7, 45, 21, -31, -41, 6, 44, 22, -30, -41, 4, 44, 23, -30, + -42, 3, 44, 24, -29, -42, 2, 44, 25, -28, -43, 1, 43, 26, -27, -43, + }, { + 32, 12, -39, -33, 21, 44, 2, -43, -25, 30, 41, -8, -45, -16, 36, 36, + -17, -45, -7, 41, 29, -26, -43, 3, 44, 20, -34, -38, 13, 45, 11, -39, + -32, 22, 44, 1, -43, -24, 30, 40, -9, -45, -15, 37, 35, -18, -45, -6, + 42, 28, -27, -42, 4, 45, 19, -34, -38, 14, 45, 10, -40, -31, 23, 44, + }, { + 32, 10, -41, -28, 29, 40, -11, -45, -9, 41, 27, -30, -40, 12, 45, 8, + -42, -26, 30, 39, -13, -45, -7, 42, 25, -31, -39, 14, 45, 6, -43, -24, + 32, 38, -15, -45, -4, 43, 23, -33, -38, 16, 45, 3, -43, -22, 34, 37, + -17, -45, -2, 44, 21, -34, -36, 18, 44, 1, -44, -20, 35, 36, -19, -44, + }, { + 32, 8, -43, -22, 35, 34, -23, -42, 9, 45, 7, -43, -21, 36, 34, -24, + -42, 10, 45, 6, -43, -20, 36, 33, -25, -41, 11, 45, 4, -44, -19, 37, + 32, -26, -41, 12, 45, 3, -44, -18, 38, 31, -27, -40, 13, 45, 2, -44, + -17, 38, 30, -28, -40, 14, 45, 1, -44, -16, 39, 30, -29, -39, 15, 45, + }, { + 32, 6, -44, -16, 40, 26, -34, -34, 25, 40, -15, -44, 4, 45, 7, -44, + -17, 39, 27, -33, -35, 24, 41, -14, -44, 3, 45, 8, -43, -18, 39, 28, + -32, -36, 23, 41, -13, -45, 2, 45, 9, -43, -19, 38, 29, -31, -36, 22, + 42, -12, -45, 1, 45, 10, -43, -20, 38, 30, -30, -37, 21, 42, -11, -45, + }, { + 32, 3, -45, -10, 43, 16, -41, -22, 38, 28, -34, -33, 29, 37, -23, -40, + 17, 43, -11, -45, 4, 45, 2, -45, -9, 44, 15, -41, -21, 38, 27, -34, + -32, 30, 36, -24, -40, 18, 43, -12, -44, 6, 45, 1, -45, -8, 44, 14, + -42, -20, 39, 26, -35, -31, 30, 36, -25, -39, 19, 42, -13, -44, 7, 45, + }, { + 32, 1, -45, -3, 45, 6, -45, -8, 44, 10, -44, -12, 43, 14, -43, -16, + 42, 18, -41, -20, 40, 22, -39, -24, 38, 26, -36, -28, 35, 30, -34, -31, + 32, 33, -30, -34, 29, 36, -27, -37, 25, 38, -23, -39, 21, 40, -19, -41, + 17, 42, -15, -43, 13, 44, -11, -44, 9, 45, -7, -45, 4, 45, -2, -45, + }, { + 32, -1, -45, 3, 45, -6, -45, 8, 44, -10, -44, 12, 43, -14, -43, 16, + 42, -18, -41, 20, 40, -22, -39, 24, 38, -26, -36, 28, 35, -30, -34, 31, + 32, -33, -30, 34, 29, -36, -27, 37, 25, -38, -23, 39, 21, -40, -19, 41, + 17, -42, -15, 43, 13, -44, -11, 44, 9, -45, -7, 45, 4, -45, -2, 45, + }, { + 32, -3, -45, 10, 43, -16, -41, 22, 38, -28, -34, 33, 29, -37, -23, 40, + 17, -43, -11, 45, 4, -45, 2, 45, -9, -44, 15, 41, -21, -38, 27, 34, + -32, -30, 36, 24, -40, -18, 43, 12, -44, -6, 45, -1, -45, 8, 44, -14, + -42, 20, 39, -26, -35, 31, 30, -36, -25, 39, 19, -42, -13, 44, 7, -45, + }, { + 32, -6, -44, 16, 40, -26, -34, 34, 25, -40, -15, 44, 4, -45, 7, 44, + -17, -39, 27, 33, -35, -24, 41, 14, -44, -3, 45, -8, -43, 18, 39, -28, + -32, 36, 23, -41, -13, 45, 2, -45, 9, 43, -19, -38, 29, 31, -36, -22, + 42, 12, -45, -1, 45, -10, -43, 20, 38, -30, -30, 37, 21, -42, -11, 45, + }, { + 32, -8, -43, 22, 35, -34, -23, 42, 9, -45, 7, 43, -21, -36, 34, 24, + -42, -10, 45, -6, -43, 20, 36, -33, -25, 41, 11, -45, 4, 44, -19, -37, + 32, 26, -41, -12, 45, -3, -44, 18, 38, -31, -27, 40, 13, -45, 2, 44, + -17, -38, 30, 28, -40, -14, 45, -1, -44, 16, 39, -30, -29, 39, 15, -45, + }, { + 32, -10, -41, 28, 29, 
-40, -11, 45, -9, -41, 27, 30, -40, -12, 45, -8, + -42, 26, 30, -39, -13, 45, -7, -42, 25, 31, -39, -14, 45, -6, -43, 24, + 32, -38, -15, 45, -4, -43, 23, 33, -38, -16, 45, -3, -43, 22, 34, -37, + -17, 45, -2, -44, 21, 34, -36, -18, 44, -1, -44, 20, 35, -36, -19, 44, + }, { + 32, -12, -39, 33, 21, -44, 2, 43, -25, -30, 41, 8, -45, 16, 36, -36, + -17, 45, -7, -41, 29, 26, -43, -3, 44, -20, -34, 38, 13, -45, 11, 39, + -32, -22, 44, -1, -43, 24, 30, -40, -9, 45, -15, -37, 35, 18, -45, 6, + 42, -28, -27, 42, 4, -45, 19, 34, -38, -14, 45, -10, -40, 31, 23, -44, + }, { + 32, -14, -36, 37, 13, -45, 15, 36, -38, -12, 45, -16, -35, 38, 11, -45, + 17, 34, -39, -10, 45, -18, -34, 39, 9, -45, 19, 33, -40, -8, 45, -20, + -32, 40, 7, -45, 21, 31, -41, -6, 44, -22, -30, 41, 4, -44, 23, 30, + -42, -3, 44, -24, -29, 42, 2, -44, 25, 28, -43, -1, 43, -26, -27, 43, + }, { + 32, -16, -34, 40, 4, -44, 27, 24, -44, 8, 39, -36, -13, 45, -19, -31, + 42, 1, -43, 30, 21, -45, 11, 37, -38, -10, 45, -22, -29, 43, -2, -41, + 32, 18, -45, 14, 35, -39, -7, 44, -25, -26, 44, -6, -40, 34, 15, -45, + 17, 33, -41, -3, 43, -28, -23, 45, -9, -38, 36, 12, -45, 20, 30, -42, + }, { + 32, -18, -30, 43, -4, -39, 36, 10, -44, 26, 23, -45, 13, 34, -41, -1, + 42, -33, -15, 45, -21, -28, 44, -8, -38, 38, 7, -44, 29, 20, -45, 16, + 32, -42, 2, 40, -35, -12, 45, -24, -25, 45, -11, -36, 40, 3, -43, 31, + 17, -45, 19, 30, -43, 6, 39, -37, -9, 44, -27, -22, 45, -14, -34, 41, + }, { + 32, -20, -27, 45, -13, -33, 43, -6, -38, 39, 2, -41, 35, 10, -44, 30, + 17, -45, 23, 24, -45, 16, 30, -44, 9, 36, -41, 1, 40, -37, -7, 43, + -32, -14, 45, -26, -21, 45, -19, -28, 44, -12, -34, 42, -4, -38, 39, 3, + -42, 34, 11, -44, 29, 18, -45, 22, 25, -45, 15, 31, -43, 8, 36, -40, + }, { + 32, -22, -23, 45, -21, -24, 45, -20, -25, 45, -19, -26, 45, -18, -27, 45, + -17, -28, 45, -16, -29, 45, -15, -30, 44, -14, -30, 44, -13, -31, 44, -12, + -32, 44, -11, -33, 43, -10, -34, 43, -9, -34, 43, -8, -35, 42, -7, -36, + 42, -6, -36, 41, -4, -37, 41, -3, -38, 40, -2, -38, 40, -1, -39, 39, + }, { + 32, -24, -19, 45, -29, -14, 44, -33, -9, 42, -36, -3, 40, -39, 2, 37, + -42, 8, 34, -44, 13, 30, -45, 18, 25, -45, 23, 20, -45, 28, 15, -44, + 32, 10, -43, 36, 4, -40, 39, -1, -38, 41, -7, -34, 43, -12, -30, 45, + -17, -26, 45, -22, -21, 45, -27, -16, 44, -31, -11, 43, -35, -6, 41, -38, + }, { + 32, -26, -15, 44, -35, -3, 39, -41, 9, 31, -45, 20, 21, -45, 30, 10, + -42, 38, -2, -36, 43, -14, -27, 45, -25, -16, 44, -34, -4, 39, -41, 8, + 32, -45, 19, 22, -45, 30, 11, -42, 38, -1, -36, 43, -13, -28, 45, -24, + -17, 44, -34, -6, 40, -40, 7, 33, -44, 18, 23, -45, 29, 12, -43, 37, + }, { + 32, -28, -11, 41, -40, 8, 30, -45, 25, 14, -43, 38, -4, -33, 45, -22, + -17, 44, -36, 1, 35, -44, 19, 20, -44, 34, 2, -37, 43, -16, -23, 45, + -32, -6, 39, -42, 13, 26, -45, 30, 9, -40, 41, -10, -29, 45, -27, -12, + 42, -39, 7, 31, -45, 24, 15, -43, 38, -3, -34, 45, -21, -18, 44, -36, + }, { + 32, -30, -7, 38, -43, 18, 19, -44, 38, -6, -30, 45, -29, -8, 39, -43, + 17, 20, -44, 37, -4, -31, 45, -28, -9, 39, -43, 16, 21, -44, 36, -3, + -32, 45, -27, -10, 40, -42, 15, 22, -44, 36, -2, -33, 45, -26, -11, 40, + -42, 14, 23, -45, 35, -1, -34, 45, -25, -12, 41, -41, 13, 24, -45, 34, + }, { + 32, -31, -2, 34, -45, 28, 7, -37, 44, -24, -11, 39, -43, 20, 15, -41, + 42, -16, -19, 43, -40, 12, 23, -44, 38, -8, -27, 45, -35, 3, 30, -45, + 32, 1, -34, 45, -29, -6, 36, -45, 25, 10, -39, 44, -21, -14, 41, -42, + 17, 18, -43, 40, -13, -22, 44, -38, 9, 26, -45, 36, -4, -30, 45, -33, + }, { + 32, -33, 2, 30, -45, 
36, -7, -26, 44, -38, 11, 22, -43, 40, -15, -18, + 42, -42, 19, 14, -40, 44, -23, -10, 38, -45, 27, 6, -35, 45, -30, -1, + 32, -45, 34, -3, -29, 45, -36, 8, 25, -44, 39, -12, -21, 43, -41, 16, + 17, -41, 43, -20, -13, 39, -44, 24, 9, -37, 45, -28, -4, 34, -45, 31, + }, { + 32, -34, 7, 24, -43, 41, -19, -12, 38, -45, 30, -1, -29, 45, -39, 14, + 17, -40, 44, -26, -4, 33, -45, 36, -9, -22, 43, -42, 21, 10, -36, 45, + -32, 3, 27, -44, 40, -16, -15, 39, -44, 28, 2, -31, 45, -37, 11, 20, + -42, 43, -23, -8, 35, -45, 34, -6, -25, 44, -41, 18, 13, -38, 45, -30, + }, { + 32, -36, 11, 18, -40, 45, -30, 3, 25, -43, 43, -24, -4, 31, -45, 39, + -17, -12, 36, -45, 35, -10, -19, 40, -44, 30, -2, -26, 43, -42, 23, 6, + -32, 45, -39, 16, 13, -37, 45, -34, 9, 20, -41, 44, -29, 1, 27, -44, + 42, -22, -7, 33, -45, 38, -15, -14, 38, -45, 34, -8, -21, 41, -44, 28, + }, { + 32, -37, 15, 12, -35, 45, -39, 18, 9, -33, 45, -40, 21, 6, -30, 44, + -42, 24, 2, -28, 43, -43, 27, -1, -25, 42, -44, 30, -4, -22, 41, -45, + 32, -8, -19, 39, -45, 34, -11, -16, 38, -45, 36, -14, -13, 36, -45, 38, + -17, -10, 34, -45, 40, -20, -7, 31, -44, 41, -23, -3, 29, -44, 43, -26, + }, { + 32, -38, 19, 6, -29, 43, -44, 31, -9, -16, 36, -45, 40, -22, -2, 26, + -42, 45, -34, 12, 13, -34, 45, -41, 25, -1, -23, 40, -45, 36, -15, -10, + 32, -44, 43, -28, 4, 20, -39, 45, -38, 18, 7, -30, 43, -44, 30, -8, + -17, 37, -45, 39, -21, -3, 27, -42, 44, -33, 11, 14, -35, 45, -41, 24, + }, { + 32, -39, 23, -1, -21, 38, -45, 40, -25, 3, 19, -37, 45, -41, 27, -6, + -17, 36, -45, 42, -29, 8, 15, -34, 44, -43, 30, -10, -13, 33, -44, 44, + -32, 12, 11, -31, 43, -44, 34, -14, -9, 30, -43, 45, -35, 16, 7, -28, + 42, -45, 36, -18, -4, 26, -41, 45, -38, 20, 2, -24, 40, -45, 39, -22, + }, { + 32, -40, 27, -8, -13, 31, -43, 45, -38, 22, -2, -18, 35, -44, 44, -34, + 17, 3, -23, 38, -45, 42, -30, 12, 9, -28, 41, -45, 40, -26, 7, 14, + -32, 43, -45, 37, -21, 1, 19, -36, 44, -44, 34, -16, -4, 24, -39, 45, + -42, 30, -11, -10, 29, -41, 45, -39, 25, -6, -15, 33, -43, 45, -36, 20, + }, { + 32, -41, 30, -14, -4, 22, -36, 44, -44, 37, -23, 6, 13, -30, 41, -45, + 42, -31, 15, 3, -21, 36, -44, 45, -38, 24, -7, -12, 29, -40, 45, -42, + 32, -16, -2, 20, -35, 44, -45, 38, -25, 8, 11, -28, 40, -45, 43, -33, + 17, 1, -19, 34, -43, 45, -39, 26, -9, -10, 27, -39, 45, -43, 34, -18, + }, { + 32, -42, 34, -20, 4, 12, -27, 38, -44, 45, -39, 28, -13, -3, 19, -33, + 42, -45, 43, -34, 21, -6, -11, 26, -38, 44, -45, 39, -29, 14, 2, -18, + 32, -41, 45, -43, 35, -22, 7, 10, -25, 37, -44, 45, -40, 30, -15, -1, + 17, -31, 41, -45, 43, -36, 23, -8, -9, 24, -36, 44, -45, 40, -30, 16, + }, { + 32, -43, 36, -26, 13, 1, -15, 28, -38, 44, -45, 42, -35, 24, -11, -3, + 17, -30, 39, -44, 45, -41, 34, -22, 9, 6, -19, 31, -40, 45, -45, 40, + -32, 20, -7, -8, 21, -33, 41, -45, 44, -39, 30, -18, 4, 10, -23, 34, + -42, 45, -44, 38, -29, 16, -2, -12, 25, -36, 43, -45, 43, -37, 27, -14, + }, { + 32, -44, 39, -31, 21, -10, -2, 14, -25, 34, -41, 45, -45, 42, -36, 28, + -17, 6, 7, -18, 29, -37, 43, -45, 44, -40, 34, -24, 13, -1, -11, 22, + -32, 39, -44, 45, -43, 38, -30, 20, -9, -3, 15, -26, 35, -41, 45, -45, + 42, -36, 27, -16, 4, 8, -19, 30, -38, 43, -45, 44, -40, 33, -23, 12, + }, { + 32, -44, 41, -36, 29, -20, 11, -1, -9, 18, -27, 34, -40, 44, -45, 45, + -42, 37, -30, 22, -13, 3, 7, -16, 25, -33, 39, -43, 45, -45, 43, -38, + 32, -24, 15, -6, -4, 14, -23, 31, -38, 42, -45, 45, -43, 39, -34, 26, + -17, 8, 2, -12, 21, -30, 36, -41, 44, -45, 44, -40, 35, -28, 19, -10, + }, { + 32, -45, 43, -39, 35, -30, 23, 
-16, 9, -1, -7, 14, -21, 28, -34, 38, + -42, 44, -45, 45, -43, 40, -36, 31, -25, 18, -11, 3, 4, -12, 19, -26, + 32, -37, 41, -44, 45, -45, 44, -41, 38, -33, 27, -20, 13, -6, -2, 10, + -17, 24, -30, 36, -40, 43, -45, 45, -44, 42, -39, 34, -29, 22, -15, 8, + }, { + 32, -45, 44, -42, 40, -37, 34, -30, 25, -20, 15, -10, 4, 1, -7, 12, + -17, 22, -27, 31, -35, 38, -41, 43, -44, 45, -45, 45, -43, 41, -39, 36, + -32, 28, -23, 18, -13, 8, -2, -3, 9, -14, 19, -24, 29, -33, 36, -39, + 42, -44, 45, -45, 45, -44, 43, -40, 38, -34, 30, -26, 21, -16, 11, -6, + }, { + 32, -45, 45, -44, 43, -42, 41, -39, 38, -36, 34, -31, 29, -26, 23, -20, + 17, -14, 11, -8, 4, -1, -2, 6, -9, 12, -15, 18, -21, 24, -27, 30, + -32, 34, -36, 38, -40, 41, -43, 44, -44, 45, -45, 45, -45, 45, -44, 43, + -42, 40, -39, 37, -35, 33, -30, 28, -25, 22, -19, 16, -13, 10, -7, 3, + }, { + 32, -45, 45, -45, 45, -45, 45, -45, 44, -44, 44, -44, 43, -43, 43, -42, + 42, -41, 41, -40, 40, -39, 39, -38, 38, -37, 36, -36, 35, -34, 34, -33, + 32, -31, 30, -30, 29, -28, 27, -26, 25, -24, 23, -22, 21, -20, 19, -18, + 17, -16, 15, -14, 13, -12, 11, -10, 9, -8, 7, -6, 4, -3, 2, -1, + } +}; diff --git a/src/shaders/icc.c b/src/shaders/icc.c new file mode 100644 index 0000000..6a16cfd --- /dev/null +++ b/src/shaders/icc.c @@ -0,0 +1,781 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include <math.h> +#include "shaders.h" + +#include <libplacebo/tone_mapping.h> +#include <libplacebo/shaders/icc.h> + +const struct pl_icc_params pl_icc_default_params = { PL_ICC_DEFAULTS }; + +#ifdef PL_HAVE_LCMS + +#include <lcms2.h> +#include <lcms2_plugin.h> + +struct icc_priv { + pl_log log; + pl_cache cache; // for backwards compatibility + cmsContext cms; + cmsHPROFILE profile; + cmsHPROFILE approx; // approximation profile + float a, b, scale; // approxmation tone curve parameters and scaling + cmsCIEXYZ black; + float gamma_stddev; + uint64_t lut_sig; +}; + +static void error_callback(cmsContext cms, cmsUInt32Number code, + const char *msg) +{ + pl_log log = cmsGetContextUserData(cms); + pl_err(log, "lcms2: [%d] %s", (int) code, msg); +} + +static void set_callback(void *priv, pl_cache_obj obj) +{ + pl_icc_object icc = priv; + icc->params.cache_save(icc->params.cache_priv, obj.key, obj.data, obj.size); +} + +static pl_cache_obj get_callback(void *priv, uint64_t key) +{ + pl_icc_object icc = priv; + int s_r = icc->params.size_r, s_g = icc->params.size_g, s_b = icc->params.size_b; + size_t data_size = s_r * s_g * s_b * sizeof(uint16_t[4]); + void *data = pl_alloc(NULL, data_size); + bool ok = icc->params.cache_load(icc->params.cache_priv, key, data, data_size); + if (!ok) { + pl_free(data); + return (pl_cache_obj) {0}; + } + + return (pl_cache_obj) { + .key = key, + .data = data, + .size = data_size, + .free = pl_free, + }; +} + +void pl_icc_close(pl_icc_object *picc) +{ + pl_icc_object icc = *picc; + if (!icc) + return; + + struct icc_priv *p = PL_PRIV(icc); + cmsCloseProfile(p->approx); + cmsCloseProfile(p->profile); + cmsDeleteContext(p->cms); + pl_cache_destroy(&p->cache); + pl_free_ptr((void **) picc); +} + +static bool detect_csp(pl_icc_object icc, struct pl_raw_primaries *prim, + float *out_gamma) +{ + struct icc_priv *p = PL_PRIV(icc); + cmsHTRANSFORM tf; + cmsHPROFILE xyz = cmsCreateXYZProfileTHR(p->cms); + if (!xyz) + return false; + + // We need to use an unadapted observer to get the raw values + cmsFloat64Number prev_adapt = cmsSetAdaptationStateTHR(p->cms, 0.0); + tf = cmsCreateTransformTHR(p->cms, p->profile, TYPE_RGB_8, xyz, TYPE_XYZ_DBL, + INTENT_ABSOLUTE_COLORIMETRIC, + /* Note: These flags mostly don't do anything + * anyway, but specify them regardless */ + cmsFLAGS_NOCACHE | + cmsFLAGS_NOOPTIMIZE); + cmsSetAdaptationStateTHR(p->cms, prev_adapt); + cmsCloseProfile(xyz); + if (!tf) + return false; + + enum { + RED, + GREEN, + BLUE, + WHITE, + BLACK, + GRAY, + RAMP, + }; + + static const uint8_t test[][3] = { + [RED] = { 0xFF, 0, 0 }, + [GREEN] = { 0, 0xFF, 0 }, + [BLUE] = { 0, 0, 0xFF }, + [WHITE] = { 0xFF, 0xFF, 0xFF }, + [BLACK] = { 0x00, 0x00, 0x00 }, + [GRAY] = { 0x80, 0x80, 0x80 }, + + // Grayscale ramp (excluding endpoints) +#define V(d) { d, d, d } + V(0x01), V(0x02), V(0x03), V(0x04), V(0x05), V(0x06), V(0x07), + V(0x08), V(0x09), V(0x0A), V(0x0B), V(0x0C), V(0x0D), V(0x0E), V(0x0F), + V(0x10), V(0x11), V(0x12), V(0x13), V(0x14), V(0x15), V(0x16), V(0x17), + V(0x18), V(0x19), V(0x1A), V(0x1B), V(0x1C), V(0x1D), V(0x1E), V(0x1F), + V(0x20), V(0x21), V(0x22), V(0x23), V(0x24), V(0x25), V(0x26), V(0x27), + V(0x28), V(0x29), V(0x2A), V(0x2B), V(0x2C), V(0x2D), V(0x2E), V(0x2F), + V(0x30), V(0x31), V(0x32), V(0x33), V(0x34), V(0x35), V(0x36), V(0x37), + V(0x38), V(0x39), V(0x3A), V(0x3B), V(0x3C), V(0x3D), V(0x3E), V(0x3F), + V(0x40), V(0x41), V(0x42), V(0x43), V(0x44), V(0x45), V(0x46), V(0x47), + V(0x48), V(0x49), V(0x4A), V(0x4B), V(0x4C), V(0x4D), 
V(0x4E), V(0x4F), + V(0x50), V(0x51), V(0x52), V(0x53), V(0x54), V(0x55), V(0x56), V(0x57), + V(0x58), V(0x59), V(0x5A), V(0x5B), V(0x5C), V(0x5D), V(0x5E), V(0x5F), + V(0x60), V(0x61), V(0x62), V(0x63), V(0x64), V(0x65), V(0x66), V(0x67), + V(0x68), V(0x69), V(0x6A), V(0x6B), V(0x6C), V(0x6D), V(0x6E), V(0x6F), + V(0x70), V(0x71), V(0x72), V(0x73), V(0x74), V(0x75), V(0x76), V(0x77), + V(0x78), V(0x79), V(0x7A), V(0x7B), V(0x7C), V(0x7D), V(0x7E), V(0x7F), + V(0x80), V(0x81), V(0x82), V(0x83), V(0x84), V(0x85), V(0x86), V(0x87), + V(0x88), V(0x89), V(0x8A), V(0x8B), V(0x8C), V(0x8D), V(0x8E), V(0x8F), + V(0x90), V(0x91), V(0x92), V(0x93), V(0x94), V(0x95), V(0x96), V(0x97), + V(0x98), V(0x99), V(0x9A), V(0x9B), V(0x9C), V(0x9D), V(0x9E), V(0x9F), + V(0xA0), V(0xA1), V(0xA2), V(0xA3), V(0xA4), V(0xA5), V(0xA6), V(0xA7), + V(0xA8), V(0xA9), V(0xAA), V(0xAB), V(0xAC), V(0xAD), V(0xAE), V(0xAF), + V(0xB0), V(0xB1), V(0xB2), V(0xB3), V(0xB4), V(0xB5), V(0xB6), V(0xB7), + V(0xB8), V(0xB9), V(0xBA), V(0xBB), V(0xBC), V(0xBD), V(0xBE), V(0xBF), + V(0xC0), V(0xC1), V(0xC2), V(0xC3), V(0xC4), V(0xC5), V(0xC6), V(0xC7), + V(0xC8), V(0xC9), V(0xCA), V(0xCB), V(0xCC), V(0xCD), V(0xCE), V(0xCF), + V(0xD0), V(0xD1), V(0xD2), V(0xD3), V(0xD4), V(0xD5), V(0xD6), V(0xD7), + V(0xD8), V(0xD9), V(0xDA), V(0xDB), V(0xDC), V(0xDD), V(0xDE), V(0xDF), + V(0xE0), V(0xE1), V(0xE2), V(0xE3), V(0xE4), V(0xE5), V(0xE6), V(0xE7), + V(0xE8), V(0xE9), V(0xEA), V(0xEB), V(0xEC), V(0xED), V(0xEE), V(0xEF), + V(0xF0), V(0xF1), V(0xF2), V(0xF3), V(0xF4), V(0xF5), V(0xF6), V(0xF7), + V(0xF8), V(0xF9), V(0xFA), V(0xFB), V(0xFC), V(0xFD), V(0xFE), +#undef V + }; + + cmsCIEXYZ dst[PL_ARRAY_SIZE(test)] = {0}; + cmsDoTransform(tf, test, dst, PL_ARRAY_SIZE(dst)); + cmsDeleteTransform(tf); + + // Read primaries from transformed RGBW values + prim->red = pl_cie_from_XYZ(dst[RED].X, dst[RED].Y, dst[RED].Z); + prim->green = pl_cie_from_XYZ(dst[GREEN].X, dst[GREEN].Y, dst[GREEN].Z); + prim->blue = pl_cie_from_XYZ(dst[BLUE].X, dst[BLUE].Y, dst[BLUE].Z); + prim->white = pl_cie_from_XYZ(dst[WHITE].X, dst[WHITE].Y, dst[WHITE].Z); + + // Rough estimate of overall gamma and starting point for curve black point + const float y_approx = dst[GRAY].Y ? log(dst[GRAY].Y) / log(0.5) : 1.0f; + const float kb = fmaxf(dst[BLACK].Y, 0.0f); + float b = powf(kb, 1 / y_approx); + + // Estimate mean and stddev of gamma (Welford's method) + float M = 0.0, S = 0.0; + int k = 1; + for (int i = RAMP; i < PL_ARRAY_SIZE(dst); i++) { // exclude primaries + if (dst[i].Y <= 0 || dst[i].Y >= 1) + continue; + float src = (1 - b) * (test[i][0] / 255.0) + b; + float y = log(dst[i].Y) / log(src); + float tmpM = M; + M += (y - tmpM) / k; + S += (y - tmpM) * (y - M); + k++; + + // Update estimate of black point according to current gamma estimate + b = powf(kb, 1 / M); + } + S = sqrt(S / (k - 1)); + + PL_INFO(p, "Detected profile approximation gamma %.3f", M); + if (S > 0.5) { + PL_WARN(p, "Detected profile gamma (%.3f) very far from pure power " + "response (stddev=%.1f), suspected unusual or broken profile. " + "Using anyway, but results may be poor.", M, S); + } else if (!(M > 0)) { + PL_ERR(p, "Arithmetic error in ICC profile gamma estimation? 
" + "Please open an issue"); + return false; + } + + *out_gamma = M; + p->gamma_stddev = S; + return true; +} + +static bool detect_contrast(pl_icc_object icc, struct pl_hdr_metadata *hdr, + struct pl_icc_params *params, float max_luma) +{ + struct icc_priv *p = PL_PRIV(icc); + cmsCIEXYZ *white = cmsReadTag(p->profile, cmsSigLuminanceTag); + enum pl_rendering_intent intent = params->intent; + /* LittleCMS refuses to detect an intent in absolute colorimetric intent, + * so fall back to relative colorimetric since we only care about the + * brightness value here */ + if (intent == PL_INTENT_ABSOLUTE_COLORIMETRIC) + intent = PL_INTENT_RELATIVE_COLORIMETRIC; + if (!cmsDetectDestinationBlackPoint(&p->black, p->profile, intent, 0)) { + /* + * v4 ICC profiles have a black point tag but only for + * perceptual/saturation intents. So we change the rendering intent + * to perceptual if we are provided a v4 ICC profile. + */ + if (cmsGetEncodedICCversion(p->profile) >= 0x4000000 && intent != PL_INTENT_PERCEPTUAL) { + params->intent = PL_INTENT_PERCEPTUAL; + return detect_contrast(icc, hdr, params, max_luma); + } + + PL_ERR(p, "Failed detecting ICC profile black point!"); + return false; + } + + if (white) { + PL_DEBUG(p, "Detected raw white point X=%.2f Y=%.2f Z=%.2f cd/m^2", + white->X, white->Y, white->Z); + } + PL_DEBUG(p, "Detected raw black point X=%.6f%% Y=%.6f%% Z=%.6f%%", + p->black.X * 100, p->black.Y * 100, p->black.Z * 100); + + if (max_luma <= 0) + max_luma = white ? white->Y : PL_COLOR_SDR_WHITE; + + hdr->max_luma = max_luma; + hdr->min_luma = p->black.Y * max_luma; + hdr->min_luma = PL_MAX(hdr->min_luma, 1e-6); // prevent true 0 + PL_INFO(p, "Using ICC contrast %.0f:1", hdr->max_luma / hdr->min_luma); + return true; +} + +static void infer_clut_size(struct pl_icc_object_t *icc) +{ + struct icc_priv *p = PL_PRIV(icc); + struct pl_icc_params *params = &icc->params; + if (params->size_r && params->size_g && params->size_b) { + PL_DEBUG(p, "Using fixed 3DLUT size: %dx%dx%d", + (int) params->size_r, (int) params->size_g, (int) params->size_b); + return; + } + +#define REQUIRE_SIZE(N) \ + params->size_r = PL_MAX(params->size_r, N); \ + params->size_g = PL_MAX(params->size_g, N); \ + params->size_b = PL_MAX(params->size_b, N) + + // Default size for sanity + REQUIRE_SIZE(9); + + // Ensure enough precision to track the (absolute) black point + if (p->black.Y > 1e-4) { + float black_rel = powf(p->black.Y, 1.0f / icc->gamma); + int min_size = 2 * (int) ceilf(1.0f / black_rel); + REQUIRE_SIZE(min_size); + } + + // Ensure enough precision to track the gamma curve + if (p->gamma_stddev > 1e-2) { + REQUIRE_SIZE(65); + } else if (p->gamma_stddev > 1e-3) { + REQUIRE_SIZE(33); + } else if (p->gamma_stddev > 1e-4) { + REQUIRE_SIZE(17); + } + + // Ensure enough precision to track any internal CLUTs + cmsPipeline *pipe = NULL; + switch (icc->params.intent) { + case PL_INTENT_SATURATION: + pipe = cmsReadTag(p->profile, cmsSigBToA2Tag); + if (pipe) + break; + // fall through + case PL_INTENT_RELATIVE_COLORIMETRIC: + case PL_INTENT_ABSOLUTE_COLORIMETRIC: + default: + pipe = cmsReadTag(p->profile, cmsSigBToA1Tag); + if (pipe) + break; + // fall through + case PL_INTENT_PERCEPTUAL: + pipe = cmsReadTag(p->profile, cmsSigBToA0Tag); + break; + } + + if (!pipe) { + switch (icc->params.intent) { + case PL_INTENT_SATURATION: + pipe = cmsReadTag(p->profile, cmsSigAToB2Tag); + if (pipe) + break; + // fall through + case PL_INTENT_RELATIVE_COLORIMETRIC: + case PL_INTENT_ABSOLUTE_COLORIMETRIC: + default: + pipe = 
cmsReadTag(p->profile, cmsSigAToB1Tag); + if (pipe) + break; + // fall through + case PL_INTENT_PERCEPTUAL: + pipe = cmsReadTag(p->profile, cmsSigAToB0Tag); + break; + } + } + + if (pipe) { + for (cmsStage *stage = cmsPipelineGetPtrToFirstStage(pipe); + stage; stage = cmsStageNext(stage)) + { + switch (cmsStageType(stage)) { + case cmsSigCLutElemType: ; + _cmsStageCLutData *data = cmsStageData(stage); + if (data->Params->nInputs != 3) + continue; + params->size_r = PL_MAX(params->size_r, data->Params->nSamples[0]); + params->size_g = PL_MAX(params->size_g, data->Params->nSamples[1]); + params->size_b = PL_MAX(params->size_b, data->Params->nSamples[2]); + break; + + default: + continue; + } + } + } + + // Clamp the output size to make sure profiles are not too large + params->size_r = PL_MIN(params->size_r, 129); + params->size_g = PL_MIN(params->size_g, 129); + params->size_b = PL_MIN(params->size_b, 129); + + // Constrain the total LUT size to roughly 1M entries + const size_t max_size = 1000000; + size_t total_size = params->size_r * params->size_g * params->size_b; + if (total_size > max_size) { + float factor = powf((float) max_size / total_size, 1/3.0f); + params->size_r = ceilf(factor * params->size_r); + params->size_g = ceilf(factor * params->size_g); + params->size_b = ceilf(factor * params->size_b); + } + + PL_INFO(p, "Chosen 3DLUT size: %dx%dx%d", + (int) params->size_r, (int) params->size_g, (int) params->size_b); +} + +static bool icc_init(struct pl_icc_object_t *icc) +{ + struct icc_priv *p = PL_PRIV(icc); + struct pl_icc_params *params = &icc->params; + if (params->intent < 0 || params->intent > PL_INTENT_ABSOLUTE_COLORIMETRIC) + params->intent = cmsGetHeaderRenderingIntent(p->profile); + + struct pl_raw_primaries *out_prim = &icc->csp.hdr.prim; + if (!detect_csp(icc, out_prim, &icc->gamma)) + return false; + if (!detect_contrast(icc, &icc->csp.hdr, params, params->max_luma)) + return false; + infer_clut_size(icc); + + const struct pl_raw_primaries *best = NULL; + for (enum pl_color_primaries prim = 1; prim < PL_COLOR_PRIM_COUNT; prim++) { + const struct pl_raw_primaries *raw = pl_raw_primaries_get(prim); + if (!icc->csp.primaries && pl_raw_primaries_similar(raw, out_prim)) { + icc->containing_primaries = prim; + icc->csp.primaries = prim; + best = raw; + break; + } + + if (pl_primaries_superset(raw, out_prim) && + (!best || pl_primaries_superset(best, raw))) + { + icc->containing_primaries = prim; + best = raw; + } + } + + if (!best) { + PL_WARN(p, "ICC profile too wide to handle, colors may be clipped!"); + icc->containing_primaries = PL_COLOR_PRIM_ACES_AP0; + best = pl_raw_primaries_get(icc->containing_primaries); + } + + // Create approximation profile. Use a tone-curve based on a BT.1886-style + // pure power curve, with an approximation gamma matched to the ICC + // profile. We stretch the luminance range *before* the input to the gamma + // function, to avoid numerical issues near the black point. 
(This removes + // the need for a separate linear section) + // + // Y = scale * (aX + b)^y, where Y = PCS luma and X = encoded value ([0-1]) + p->scale = pl_hdr_rescale(PL_HDR_NITS, PL_HDR_NORM, icc->csp.hdr.max_luma); + p->b = powf(icc->csp.hdr.min_luma / icc->csp.hdr.max_luma, 1.0f / icc->gamma); + p->a = (1 - p->b); + cmsToneCurve *curve = cmsBuildParametricToneCurve(p->cms, 2, + (double[3]) { icc->gamma, p->a, p->b }); + if (!curve) + return false; + + cmsCIExyY wp_xyY = { best->white.x, best->white.y, 1.0 }; + cmsCIExyYTRIPLE prim_xyY = { + .Red = { best->red.x, best->red.y, 1.0 }, + .Green = { best->green.x, best->green.y, 1.0 }, + .Blue = { best->blue.x, best->blue.y, 1.0 }, + }; + + p->approx = cmsCreateRGBProfileTHR(p->cms, &wp_xyY, &prim_xyY, + (cmsToneCurve *[3]){ curve, curve, curve }); + cmsFreeToneCurve(curve); + if (!p->approx) + return false; + + // We need to create an ICC V2 profile because ICC V4 perceptual profiles + // have normalized semantics, but we want colorimetric mapping with BPC + cmsSetHeaderRenderingIntent(p->approx, icc->params.intent); + cmsSetProfileVersion(p->approx, 2.2); + + // Hash all parameters affecting the generated 3DLUT + p->lut_sig = CACHE_KEY_ICC_3DLUT; + pl_hash_merge(&p->lut_sig, icc->signature); + pl_hash_merge(&p->lut_sig, params->intent); + pl_hash_merge(&p->lut_sig, params->size_r); + pl_hash_merge(&p->lut_sig, params->size_g); + pl_hash_merge(&p->lut_sig, params->size_b); + pl_hash_merge(&p->lut_sig, params->force_bpc); + union { double d; uint64_t u; } v = { .d = icc->csp.hdr.max_luma }; + pl_hash_merge(&p->lut_sig, v.u); + // min luma depends only on the max luma and profile + + // Backwards compatibility with old caching API + if ((params->cache_save || params->cache_load) && !params->cache) { + p->cache = pl_cache_create(pl_cache_params( + .log = p->log, + .set = params->cache_save ? set_callback : NULL, + .get = params->cache_load ? get_callback : NULL, + .priv = icc, + )); + } + + return true; +} + +pl_icc_object pl_icc_open(pl_log log, const struct pl_icc_profile *profile, + const struct pl_icc_params *params) +{ + if (!profile->len) + return NULL; + + struct pl_icc_object_t *icc = pl_zalloc_obj(NULL, icc, struct icc_priv); + struct icc_priv *p = PL_PRIV(icc); + icc->params = params ? 
*params : pl_icc_default_params; + icc->signature = profile->signature; + p->log = log; + p->cms = cmsCreateContext(NULL, (void *) log); + if (!p->cms) { + PL_ERR(p, "Failed creating LittleCMS context!"); + goto error; + } + + cmsSetLogErrorHandlerTHR(p->cms, error_callback); + PL_INFO(p, "Opening ICC profile.."); + p->profile = cmsOpenProfileFromMemTHR(p->cms, profile->data, profile->len); + if (!p->profile) { + PL_ERR(p, "Failed opening ICC profile"); + goto error; + } + + if (cmsGetColorSpace(p->profile) != cmsSigRgbData) { + PL_ERR(p, "Invalid ICC profile: not RGB"); + goto error; + } + + if (!icc_init(icc)) + goto error; + + return icc; + +error: + pl_icc_close((pl_icc_object *) &icc); + return NULL; +} + +static bool icc_reopen(pl_icc_object kicc, const struct pl_icc_params *params) +{ + struct pl_icc_object_t *icc = (struct pl_icc_object_t *) kicc; + struct icc_priv *p = PL_PRIV(icc); + cmsCloseProfile(p->approx); + pl_cache_destroy(&p->cache); + + *icc = (struct pl_icc_object_t) { + .params = *params, + .signature = icc->signature, + }; + + *p = (struct icc_priv) { + .log = p->log, + .cms = p->cms, + .profile = p->profile, + }; + + PL_DEBUG(p, "Reinitializing ICC profile in-place"); + return icc_init(icc); +} + +bool pl_icc_update(pl_log log, pl_icc_object *out_icc, + const struct pl_icc_profile *profile, + const struct pl_icc_params *params) +{ + params = PL_DEF(params, &pl_icc_default_params); + pl_icc_object icc = *out_icc; + if (!icc && !profile) + return false; // nothing to update + + uint64_t sig = profile ? profile->signature : icc->signature; + if (!icc || icc->signature != sig) { + pl_assert(profile); + pl_icc_close(&icc); + *out_icc = icc = pl_icc_open(log, profile, params); + return icc != NULL; + } + + int size_r = PL_DEF(params->size_r, icc->params.size_r); + int size_g = PL_DEF(params->size_g, icc->params.size_g); + int size_b = PL_DEF(params->size_b, icc->params.size_b); + bool compat = params->intent == icc->params.intent && + params->max_luma == icc->params.max_luma && + params->force_bpc == icc->params.force_bpc && + size_r == icc->params.size_r && + size_g == icc->params.size_g && + size_b == icc->params.size_b; + if (compat) + return true; + + // ICC signature is the same but parameters are different, re-open in-place + if (!icc_reopen(icc, params)) { + pl_icc_close(&icc); + *out_icc = NULL; + return false; + } + + return true; +} + +static void fill_lut(void *datap, const struct sh_lut_params *params, bool decode) +{ + pl_icc_object icc = params->priv; + struct icc_priv *p = PL_PRIV(icc); + cmsHPROFILE srcp = decode ? p->profile : p->approx; + cmsHPROFILE dstp = decode ? 
p->approx : p->profile; + int s_r = params->width, s_g = params->height, s_b = params->depth; + + pl_clock_t start = pl_clock_now(); + cmsHTRANSFORM tf = cmsCreateTransformTHR(p->cms, srcp, TYPE_RGB_16, + dstp, TYPE_RGBA_16, + icc->params.intent, + cmsFLAGS_BLACKPOINTCOMPENSATION | + cmsFLAGS_NOCACHE | cmsFLAGS_NOOPTIMIZE); + if (!tf) + return; + + pl_clock_t after_transform = pl_clock_now(); + pl_log_cpu_time(p->log, start, after_transform, "creating ICC transform"); + + uint16_t *tmp = pl_alloc(NULL, s_r * 3 * sizeof(tmp[0])); + for (int b = 0; b < s_b; b++) { + for (int g = 0; g < s_g; g++) { + // Transform a single line of the output buffer + for (int r = 0; r < s_r; r++) { + tmp[r * 3 + 0] = r * 65535 / (s_r - 1); + tmp[r * 3 + 1] = g * 65535 / (s_g - 1); + tmp[r * 3 + 2] = b * 65535 / (s_b - 1); + } + + size_t offset = (b * s_g + g) * s_r * 4; + uint16_t *data = ((uint16_t *) datap) + offset; + cmsDoTransform(tf, tmp, data, s_r); + + if (!icc->params.force_bpc) + continue; + + // Fix the black point manually. Work-around for "improper" + // profiles, as black point compensation should already have + // taken care of this normally. + const uint16_t knee = 16u << 8; + if (tmp[0] >= knee || tmp[1] >= knee) + continue; + for (int r = 0; r < s_r; r++) { + uint16_t s = (2 * tmp[1] + tmp[2] + tmp[r * 3]) >> 2; + if (s >= knee) + break; + for (int c = 0; c < 3; c++) + data[r * 3 + c] = (s * data[r * 3 + c] + (knee - s) * s) >> 12; + } + } + } + + pl_log_cpu_time(p->log, after_transform, pl_clock_now(), "generating ICC 3DLUT"); + cmsDeleteTransform(tf); + pl_free(tmp); +} + +static void fill_decode(void *datap, const struct sh_lut_params *params) +{ + fill_lut(datap, params, true); +} + +static void fill_encode(void *datap, const struct sh_lut_params *params) +{ + fill_lut(datap, params, false); +} + +static pl_cache get_cache(pl_icc_object icc, pl_shader sh) +{ + struct icc_priv *p = PL_PRIV(icc); + return PL_DEF(icc->params.cache, PL_DEF(p->cache, SH_CACHE(sh))); +} + +void pl_icc_decode(pl_shader sh, pl_icc_object icc, pl_shader_obj *lut_obj, + struct pl_color_space *out_csp) +{ + struct icc_priv *p = PL_PRIV(icc); + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return; + + pl_fmt fmt = pl_find_fmt(SH_GPU(sh), PL_FMT_UNORM, 4, 16, 16, PL_FMT_CAP_LINEAR); + if (!fmt) { + SH_FAIL(sh, "Failed finding ICC 3DLUT texture format!"); + return; + } + + ident_t lut = sh_lut(sh, sh_lut_params( + .object = lut_obj, + .var_type = PL_VAR_FLOAT, + .method = SH_LUT_TETRAHEDRAL, + .fmt = fmt, + .width = icc->params.size_r, + .height = icc->params.size_g, + .depth = icc->params.size_b, + .comps = 4, + .signature = p->lut_sig, + .fill = fill_decode, + .cache = get_cache(icc, sh), + .priv = (void *) icc, + )); + + if (!lut) { + SH_FAIL(sh, "pl_icc_decode: failed generating LUT object"); + return; + } + + // Y = scale * (aX + b)^y + sh_describe(sh, "ICC 3DLUT"); + GLSL("// pl_icc_decode \n" + "{ \n" + "color.rgb = "$"(color.rgb).rgb; \n" + "color.rgb = "$" * color.rgb + vec3("$"); \n" + "color.rgb = pow(color.rgb, vec3("$")); \n" + "color.rgb = "$" * color.rgb; \n" + "} \n", + lut, + SH_FLOAT(p->a), SH_FLOAT(p->b), + SH_FLOAT(icc->gamma), + SH_FLOAT(p->scale)); + + if (out_csp) { + *out_csp = (struct pl_color_space) { + .primaries = icc->containing_primaries, + .transfer = PL_COLOR_TRC_LINEAR, + .hdr = icc->csp.hdr, + }; + } +} + +void pl_icc_encode(pl_shader sh, pl_icc_object icc, pl_shader_obj *lut_obj) +{ + struct icc_priv *p = PL_PRIV(icc); + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return; + 
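+ // As in pl_icc_decode above: look for a 16-bit, 4-component UNORM texture
+ // format with linear filtering to hold the encoding 3DLUT.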
+ pl_fmt fmt = pl_find_fmt(SH_GPU(sh), PL_FMT_UNORM, 4, 16, 16, PL_FMT_CAP_LINEAR); + if (!fmt) { + SH_FAIL(sh, "Failed finding ICC 3DLUT texture format!"); + return; + } + + ident_t lut = sh_lut(sh, sh_lut_params( + .object = lut_obj, + .var_type = PL_VAR_FLOAT, + .method = SH_LUT_TETRAHEDRAL, + .fmt = fmt, + .width = icc->params.size_r, + .height = icc->params.size_g, + .depth = icc->params.size_b, + .comps = 4, + .signature = ~p->lut_sig, // avoid confusion with decoding LUTs + .fill = fill_encode, + .cache = get_cache(icc, sh), + .priv = (void *) icc, + )); + + if (!lut) { + SH_FAIL(sh, "pl_icc_encode: failed generating LUT object"); + return; + } + + // X = 1/a * (Y/scale)^(1/y) - b/a + sh_describe(sh, "ICC 3DLUT"); + GLSL("// pl_icc_encode \n" + "{ \n" + "color.rgb = max(color.rgb, 0.0); \n" + "color.rgb = 1.0/"$" * color.rgb; \n" + "color.rgb = pow(color.rgb, vec3("$")); \n" + "color.rgb = 1.0/"$" * color.rgb - "$"; \n" + "color.rgb = "$"(color.rgb).rgb; \n" + "} \n", + SH_FLOAT(p->scale), + SH_FLOAT(1.0f / icc->gamma), + SH_FLOAT(p->a), SH_FLOAT(p->b / p->a), + lut); +} + +#else // !PL_HAVE_LCMS + +void pl_icc_close(pl_icc_object *picc) {}; +pl_icc_object pl_icc_open(pl_log log, const struct pl_icc_profile *profile, + const struct pl_icc_params *pparams) +{ + pl_err(log, "libplacebo compiled without LittleCMS 2 support!"); + return NULL; +} + +bool pl_icc_update(pl_log log, pl_icc_object *obj, + const struct pl_icc_profile *profile, + const struct pl_icc_params *params) +{ + static bool warned; + if (!warned) { + pl_err(log, "libplacebo compiled without LittleCMS 2 support!"); + warned = true; + } + *obj = NULL; + return false; +} + +void pl_icc_decode(pl_shader sh, pl_icc_object icc, pl_shader_obj *lut_obj, + struct pl_color_space *out_csp) +{ + pl_unreachable(); // can't get a pl_icc_object +} + +void pl_icc_encode(pl_shader sh, pl_icc_object icc, pl_shader_obj *lut_obj) +{ + pl_unreachable(); +} + +#endif diff --git a/src/shaders/lut.c b/src/shaders/lut.c new file mode 100644 index 0000000..b0124fc --- /dev/null +++ b/src/shaders/lut.c @@ -0,0 +1,820 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include <math.h> +#include <ctype.h> + +#include "shaders.h" + +#include <libplacebo/shaders/lut.h> + +static inline bool isnumeric(char c) +{ + return (c >= '0' && c <= '9') || c == '-'; +} + +void pl_lut_free(struct pl_custom_lut **lut) +{ + pl_free_ptr(lut); +} + +struct pl_custom_lut *pl_lut_parse_cube(pl_log log, const char *cstr, size_t cstr_len) +{ + struct pl_custom_lut *lut = pl_zalloc_ptr(NULL, lut); + pl_str str = (pl_str) { (uint8_t *) cstr, cstr_len }; + lut->signature = pl_str_hash(str); + int entries = 0; + + float min[3] = { 0.0, 0.0, 0.0 }; + float max[3] = { 1.0, 1.0, 1.0 }; + + // Parse header + while (str.len && !isnumeric(str.buf[0])) { + pl_str line = pl_str_strip(pl_str_getline(str, &str)); + if (!line.len) + continue; // skip empty line + + if (pl_str_eatstart0(&line, "TITLE")) { + pl_info(log, "Loading LUT: %.*s", PL_STR_FMT(pl_str_strip(line))); + continue; + } + + if (pl_str_eatstart0(&line, "LUT_3D_SIZE")) { + line = pl_str_strip(line); + int size; + if (!pl_str_parse_int(line, &size)) { + pl_err(log, "Failed parsing dimension '%.*s'", PL_STR_FMT(line)); + goto error; + } + if (size <= 0 || size > 1024) { + pl_err(log, "Invalid 3DLUT size: %dx%d%x", size, size, size); + goto error; + } + + lut->size[0] = lut->size[1] = lut->size[2] = size; + entries = size * size * size; + continue; + } + + if (pl_str_eatstart0(&line, "LUT_1D_SIZE")) { + line = pl_str_strip(line); + int size; + if (!pl_str_parse_int(line, &size)) { + pl_err(log, "Failed parsing dimension '%.*s'", PL_STR_FMT(line)); + goto error; + } + if (size <= 0 || size > 65536) { + pl_err(log, "Invalid 1DLUT size: %d", size); + goto error; + } + + lut->size[0] = size; + lut->size[1] = lut->size[2] = 0; + entries = size; + continue; + } + + if (pl_str_eatstart0(&line, "DOMAIN_MIN")) { + line = pl_str_strip(line); + if (!pl_str_parse_float(pl_str_split_char(line, ' ', &line), &min[0]) || + !pl_str_parse_float(pl_str_split_char(line, ' ', &line), &min[1]) || + !pl_str_parse_float(line, &min[2])) + { + pl_err(log, "Failed parsing domain: '%.*s'", PL_STR_FMT(line)); + goto error; + } + continue; + } + + if (pl_str_eatstart0(&line, "DOMAIN_MAX")) { + line = pl_str_strip(line); + if (!pl_str_parse_float(pl_str_split_char(line, ' ', &line), &max[0]) || + !pl_str_parse_float(pl_str_split_char(line, ' ', &line), &max[1]) || + !pl_str_parse_float(line, &max[2])) + { + pl_err(log, "Failed parsing domain: '%.*s'", PL_STR_FMT(line)); + goto error; + } + continue; + } + + if (pl_str_eatstart0(&line, "#")) { + pl_debug(log, "Unhandled .cube comment: %.*s", + PL_STR_FMT(pl_str_strip(line))); + continue; + } + + pl_warn(log, "Unhandled .cube line: %.*s", PL_STR_FMT(pl_str_strip(line))); + } + + if (!entries) { + pl_err(log, "Missing LUT size specification?"); + goto error; + } + + for (int i = 0; i < 3; i++) { + if (max[i] - min[i] < 1e-6) { + pl_err(log, "Invalid domain range: [%f, %f]", min[i], max[i]); + goto error; + } + } + + float *data = pl_alloc(lut, sizeof(float[3]) * entries); + lut->data = data; + + // Parse LUT body + pl_clock_t start = pl_clock_now(); + for (int n = 0; n < entries; n++) { + for (int c = 0; c < 3; c++) { + static const char * const digits = "0123456789.-+e"; + + // Extract valid digit sequence + size_t len = pl_strspn(str, digits); + pl_str entry = (pl_str) { str.buf, len }; + str.buf += len; + str.len -= len; + + if (!entry.len) { + if (!str.len) { + pl_err(log, "Failed parsing LUT: Unexpected EOF, expected " + "%d entries, got %d", entries * 3, n * 3 + c + 1); + } else { + pl_err(log, 
"Failed parsing LUT: Unexpected '%c', expected " + "digit", str.buf[0]); + } + goto error; + } + + float num; + if (!pl_str_parse_float(entry, &num)) { + pl_err(log, "Failed parsing float value '%.*s'", PL_STR_FMT(entry)); + goto error; + } + + // Rescale to range 0.0 - 1.0 + *data++ = (num - min[c]) / (max[c] - min[c]); + + // Skip whitespace between digits + str = pl_str_strip(str); + } + } + + str = pl_str_strip(str); + if (str.len) + pl_warn(log, "Extra data after LUT?... ignoring '%c'", str.buf[0]); + + pl_log_cpu_time(log, start, pl_clock_now(), "parsing .cube LUT"); + return lut; + +error: + pl_free(lut); + return NULL; +} + +static void fill_lut(void *datap, const struct sh_lut_params *params) +{ + const struct pl_custom_lut *lut = params->priv; + + int dim_r = params->width; + int dim_g = PL_DEF(params->height, 1); + int dim_b = PL_DEF(params->depth, 1); + + float *data = datap; + for (int b = 0; b < dim_b; b++) { + for (int g = 0; g < dim_g; g++) { + for (int r = 0; r < dim_r; r++) { + size_t offset = (b * dim_g + g) * dim_r + r; + const float *src = &lut->data[offset * 3]; + float *dst = &data[offset * 4]; + dst[0] = src[0]; + dst[1] = src[1]; + dst[2] = src[2]; + dst[3] = 0.0f; + } + } + } +} + +void pl_shader_custom_lut(pl_shader sh, const struct pl_custom_lut *lut, + pl_shader_obj *lut_state) +{ + if (!lut) + return; + + int dims; + if (lut->size[0] > 0 && lut->size[1] > 0 && lut->size[2] > 0) { + dims = 3; + } else if (lut->size[0] > 0 && !lut->size[1] && !lut->size[2]) { + dims = 1; + } else { + SH_FAIL(sh, "Invalid dimensions %dx%dx%d for pl_custom_lut, must be 1D " + "or 3D!", lut->size[0], lut->size[1], lut->size[2]); + return; + } + + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return; + + ident_t fun = sh_lut(sh, sh_lut_params( + .object = lut_state, + .var_type = PL_VAR_FLOAT, + .method = SH_LUT_TETRAHEDRAL, + .width = lut->size[0], + .height = lut->size[1], + .depth = lut->size[2], + .comps = 4, // for better texel alignment + .signature = lut->signature, + .fill = fill_lut, + .priv = (void *) lut, + )); + + if (!fun) { + SH_FAIL(sh, "pl_shader_custom_lut: failed generating LUT object"); + return; + } + + GLSL("// pl_shader_custom_lut \n"); + + static const pl_matrix3x3 zero = {0}; + if (memcmp(&lut->shaper_in, &zero, sizeof(zero)) != 0) { + GLSL("color.rgb = "$" * color.rgb; \n", sh_var(sh, (struct pl_shader_var) { + .var = pl_var_mat3("shaper_in"), + .data = PL_TRANSPOSE_3X3(lut->shaper_in.m), + })); + } + + switch (dims) { + case 1: + sh_describe(sh, "custom 1DLUT"); + GLSL("color.rgb = vec3("$"(color.r).r, \n" + " "$"(color.g).g, \n" + " "$"(color.b).b); \n", + fun, fun, fun); + break; + case 3: + sh_describe(sh, "custom 3DLUT"); + GLSL("color.rgb = "$"(color.rgb).rgb; \n", fun); + break; + } + + if (memcmp(&lut->shaper_out, &zero, sizeof(zero)) != 0) { + GLSL("color.rgb = "$" * color.rgb; \n", sh_var(sh, (struct pl_shader_var) { + .var = pl_var_mat3("shaper_out"), + .data = PL_TRANSPOSE_3X3(lut->shaper_out.m), + })); + } +} + +// Defines a LUT position helper macro. This translates from an absolute texel +// scale (either in texels, or normalized to [0,1]) to the texture coordinate +// scale for the corresponding sample in a texture of dimension `lut_size`. +static ident_t texel_scale(pl_shader sh, int lut_size, bool normalized) +{ + const float base = 0.5f / lut_size; + const float end = 1.0f - 0.5f / lut_size; + const float scale = (end - base) / (normalized ? 
1.0f : (lut_size - 1)); + + ident_t name = sh_fresh(sh, "LUT_SCALE"); + GLSLH("#define "$"(x) ("$" * (x) + "$") \n", + name, SH_FLOAT(scale), SH_FLOAT(base)); + return name; +} + +struct sh_lut_obj { + enum sh_lut_type type; + enum sh_lut_method method; + enum pl_var_type vartype; + pl_fmt fmt; + int width, height, depth, comps; + uint64_t signature; + bool error; // reset if params change + + // weights, depending on the lut type + pl_tex tex; + pl_str str; + void *data; +}; + +static void sh_lut_uninit(pl_gpu gpu, void *ptr) +{ + struct sh_lut_obj *lut = ptr; + pl_tex_destroy(gpu, &lut->tex); + pl_free(lut->str.buf); + pl_free(lut->data); + + *lut = (struct sh_lut_obj) {0}; +} + +// Maximum number of floats to embed as a literal array (when using SH_LUT_AUTO) +#define SH_LUT_MAX_LITERAL_SOFT 64 +#define SH_LUT_MAX_LITERAL_HARD 256 + +ident_t sh_lut(pl_shader sh, const struct sh_lut_params *params) +{ + pl_gpu gpu = SH_GPU(sh); + pl_cache_obj obj = { .key = CACHE_KEY_SH_LUT ^ params->signature }; + + const enum pl_var_type vartype = params->var_type; + pl_assert(vartype != PL_VAR_INVALID); + pl_assert(params->method == SH_LUT_NONE || vartype == PL_VAR_FLOAT); + pl_assert(params->width > 0 && params->height >= 0 && params->depth >= 0); + pl_assert(params->comps > 0); + pl_assert(!params->cache || params->signature); + + int sizes[] = { params->width, params->height, params->depth }; + int size = params->width * PL_DEF(params->height, 1) * PL_DEF(params->depth, 1); + int dims = params->depth ? 3 : params->height ? 2 : 1; + enum sh_lut_method method = params->method; + if (method == SH_LUT_TETRAHEDRAL && dims != 3) + method = SH_LUT_LINEAR; + if (method == SH_LUT_CUBIC && dims != 3) + method = SH_LUT_LINEAR; + + int texdim = 0; + uint32_t max_tex_dim[] = { + gpu ? gpu->limits.max_tex_1d_dim : 0, + gpu ? gpu->limits.max_tex_2d_dim : 0, + (gpu && gpu->glsl.version > 100) ? gpu->limits.max_tex_3d_dim : 0, + }; + + struct sh_lut_obj *lut = SH_OBJ(sh, params->object, PL_SHADER_OBJ_LUT, + struct sh_lut_obj, sh_lut_uninit); + + if (!lut) + return NULL_IDENT; + + bool update = params->update || lut->signature != params->signature || + vartype != lut->vartype || params->fmt != lut->fmt || + params->width != lut->width || params->height != lut->height || + params->depth != lut->depth || params->comps != lut->comps; + + if (lut->error && !update) + return NULL_IDENT; // suppress error spam until something changes + + // Try picking the right number of dimensions for the texture LUT. This + // allows e.g. falling back to 2D textures if 1D textures are unsupported. 
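+ // (The smallest dimensionality that can hold every axis is picked.)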
+ for (int d = dims; d <= PL_ARRAY_SIZE(max_tex_dim); d++) { + // For a given dimension to be compatible, all coordinates need to be + // within the maximum texture size for that dimension + for (int i = 0; i < d; i++) { + if (sizes[i] > max_tex_dim[d - 1]) + goto next_dim; + } + + // All dimensions are compatible, so pick this texture dimension + texdim = d; + break; + +next_dim: ; // `continue` out of the inner loop + } + + static const enum pl_fmt_type fmt_type[PL_VAR_TYPE_COUNT] = { + [PL_VAR_SINT] = PL_FMT_SINT, + [PL_VAR_UINT] = PL_FMT_UINT, + [PL_VAR_FLOAT] = PL_FMT_FLOAT, + }; + + enum pl_fmt_caps texcaps = PL_FMT_CAP_SAMPLEABLE; + bool is_linear = method == SH_LUT_LINEAR || method == SH_LUT_CUBIC; + if (is_linear) + texcaps |= PL_FMT_CAP_LINEAR; + + pl_fmt texfmt = params->fmt; + if (texfmt) { + bool ok; + switch (texfmt->type) { + case PL_FMT_SINT: ok = vartype == PL_VAR_SINT; break; + case PL_FMT_UINT: ok = vartype == PL_VAR_UINT; break; + default: ok = vartype == PL_VAR_FLOAT; break; + } + + if (!ok) { + PL_ERR(sh, "Specified texture format '%s' does not match LUT " + "data type!", texfmt->name); + goto error; + } + + if (~texfmt->caps & texcaps) { + PL_ERR(sh, "Specified texture format '%s' does not match " + "required capabilities 0x%x!\n", texfmt->name, texcaps); + goto error; + } + } + + if (texdim && !texfmt) { + texfmt = pl_find_fmt(gpu, fmt_type[vartype], params->comps, + vartype == PL_VAR_FLOAT ? 16 : 32, + pl_var_type_size(vartype) * 8, + texcaps); + } + + enum sh_lut_type type = params->lut_type; + + // The linear sampling code currently only supports 1D linear interpolation + if (is_linear && dims > 1) { + if (texfmt) { + type = SH_LUT_TEXTURE; + } else { + PL_ERR(sh, "Can't emulate linear LUTs for 2D/3D LUTs and no " + "texture support available!"); + goto error; + } + } + + bool can_uniform = gpu && gpu->limits.max_variable_comps >= size * params->comps; + bool can_literal = sh_glsl(sh).version > 110; // needed for literal arrays + can_literal &= size <= SH_LUT_MAX_LITERAL_HARD && !params->dynamic; + + // Deselect unsupported methods + if (type == SH_LUT_UNIFORM && !can_uniform) + type = SH_LUT_AUTO; + if (type == SH_LUT_LITERAL && !can_literal) + type = SH_LUT_AUTO; + if (type == SH_LUT_TEXTURE && !texfmt) + type = SH_LUT_AUTO; + + // Sorted by priority + if (!type && can_literal && !method && size <= SH_LUT_MAX_LITERAL_SOFT) + type = SH_LUT_LITERAL; + if (!type && texfmt) + type = SH_LUT_TEXTURE; + if (!type && can_uniform) + type = SH_LUT_UNIFORM; + if (!type && can_literal) + type = SH_LUT_LITERAL; + + if (!type) { + PL_ERR(sh, "Can't generate LUT: no compatible methods!"); + goto error; + } + + // Reinitialize the existing LUT if needed + update |= type != lut->type; + update |= method != lut->method; + + if (update) { + if (params->dynamic) + pl_log_level_cap(sh->log, PL_LOG_TRACE); + + size_t el_size = params->comps * pl_var_type_size(vartype); + if (type == SH_LUT_TEXTURE) + el_size = texfmt->texel_size; + + size_t buf_size = size * el_size; + if (pl_cache_get(params->cache, &obj) && obj.size == buf_size) { + PL_DEBUG(sh, "Re-using cached LUT (0x%"PRIx64") with size %zu", + obj.key, obj.size); + } else { + PL_DEBUG(sh, "LUT invalidated, regenerating.."); + pl_cache_obj_resize(NULL, &obj, buf_size); + pl_clock_t start = pl_clock_now(); + params->fill(obj.data, params); + pl_log_cpu_time(sh->log, start, pl_clock_now(), "generating shader LUT"); + } + + pl_assert(obj.data && obj.size); + if (params->dynamic) + pl_log_level_cap(sh->log, PL_LOG_NONE); + + switch 
(type) { + case SH_LUT_TEXTURE: { + if (!texdim) { + PL_ERR(sh, "Texture LUT exceeds texture dimensions!"); + goto error; + } + + if (!texfmt) { + PL_ERR(sh, "Found no compatible texture format for LUT!"); + goto error; + } + + struct pl_tex_params tex_params = { + .w = params->width, + .h = PL_DEF(params->height, texdim >= 2 ? 1 : 0), + .d = PL_DEF(params->depth, texdim >= 3 ? 1 : 0), + .format = texfmt, + .sampleable = true, + .host_writable = params->dynamic, + .initial_data = params->dynamic ? NULL : obj.data, + .debug_tag = params->debug_tag, + }; + + bool ok; + if (params->dynamic) { + ok = pl_tex_recreate(gpu, &lut->tex, &tex_params); + if (ok) { + ok = pl_tex_upload(gpu, pl_tex_transfer_params( + .tex = lut->tex, + .ptr = obj.data, + )); + } + } else { + // Can't use pl_tex_recreate because of `initial_data` + pl_tex_destroy(gpu, &lut->tex); + lut->tex = pl_tex_create(gpu, &tex_params); + ok = lut->tex; + } + + if (!ok) { + PL_ERR(sh, "Failed creating LUT texture!"); + goto error; + } + break; + } + + case SH_LUT_UNIFORM: + pl_free(lut->data); + lut->data = pl_memdup(NULL, obj.data, obj.size); + break; + + case SH_LUT_LITERAL: { + lut->str.len = 0; + static const char prefix[PL_VAR_TYPE_COUNT] = { + [PL_VAR_SINT] = 'i', + [PL_VAR_UINT] = 'u', + [PL_VAR_FLOAT] = ' ', + }; + + for (int i = 0; i < size * params->comps; i += params->comps) { + if (i > 0) + pl_str_append_asprintf_c(lut, &lut->str, ","); + if (params->comps > 1) { + pl_str_append_asprintf_c(lut, &lut->str, "%cvec%d(", + prefix[vartype], params->comps); + } + for (int c = 0; c < params->comps; c++) { + switch (vartype) { + case PL_VAR_FLOAT: + pl_str_append_asprintf_c(lut, &lut->str, "%s%f", + c > 0 ? "," : "", + ((float *) obj.data)[i+c]); + break; + case PL_VAR_UINT: + pl_str_append_asprintf_c(lut, &lut->str, "%s%u", + c > 0 ? "," : "", + ((unsigned int *) obj.data)[i+c]); + break; + case PL_VAR_SINT: + pl_str_append_asprintf_c(lut, &lut->str, "%s%d", + c > 0 ? "," : "", + ((int *) obj.data)[i+c]); + break; + case PL_VAR_INVALID: + case PL_VAR_TYPE_COUNT: + pl_unreachable(); + } + } + if (params->comps > 1) + pl_str_append_asprintf_c(lut, &lut->str, ")"); + } + break; + } + + case SH_LUT_AUTO: + pl_unreachable(); + } + + lut->type = type; + lut->method = method; + lut->vartype = vartype; + lut->fmt = params->fmt; + lut->width = params->width; + lut->height = params->height; + lut->depth = params->depth; + lut->comps = params->comps; + lut->signature = params->signature; + pl_cache_set(params->cache, &obj); + } + + // Done updating, generate the GLSL + ident_t name = sh_fresh(sh, "lut"); + ident_t arr_name = NULL_IDENT; + + static const char * const swizzles[] = {"x", "xy", "xyz", "xyzw"}; + static const char * const vartypes[PL_VAR_TYPE_COUNT][4] = { + [PL_VAR_SINT] = { "int", "ivec2", "ivec3", "ivec4" }, + [PL_VAR_UINT] = { "uint", "uvec2", "uvec3", "uvec4" }, + [PL_VAR_FLOAT] = { "float", "vec2", "vec3", "vec4" }, + }; + + switch (type) { + case SH_LUT_TEXTURE: { + assert(texdim); + ident_t tex = sh_desc(sh, (struct pl_shader_desc) { + .desc = { + .name = "weights", + .type = PL_DESC_SAMPLED_TEX, + }, + .binding = { + .object = lut->tex, + .sample_mode = is_linear ? 
PL_TEX_SAMPLE_LINEAR + : PL_TEX_SAMPLE_NEAREST, + } + }); + + if (is_linear) { + ident_t pos_macros[PL_ARRAY_SIZE(sizes)] = {0}; + for (int i = 0; i < dims; i++) + pos_macros[i] = texel_scale(sh, sizes[i], true); + + GLSLH("#define "$"(pos) (textureLod("$", %s(\\\n", + name, tex, vartypes[PL_VAR_FLOAT][texdim - 1]); + + for (int i = 0; i < texdim; i++) { + char sep = i == 0 ? ' ' : ','; + if (pos_macros[i]) { + if (dims > 1) { + GLSLH(" %c"$"(%s(pos).%c)\\\n", sep, pos_macros[i], + vartypes[PL_VAR_FLOAT][dims - 1], "xyzw"[i]); + } else { + GLSLH(" %c"$"(float(pos))\\\n", sep, pos_macros[i]); + } + } else { + GLSLH(" %c%f\\\n", sep, 0.5); + } + } + GLSLH(" ), 0.0).%s)\n", swizzles[params->comps - 1]); + } else { + GLSLH("#define "$"(pos) (texelFetch("$", %s(pos", + name, tex, vartypes[PL_VAR_SINT][texdim - 1]); + + // Fill up extra components of the index + for (int i = dims; i < texdim; i++) + GLSLH(", 0"); + + GLSLH("), 0).%s)\n", swizzles[params->comps - 1]); + } + break; + } + + case SH_LUT_UNIFORM: + arr_name = sh_var(sh, (struct pl_shader_var) { + .var = { + .name = "weights", + .type = vartype, + .dim_v = params->comps, + .dim_m = 1, + .dim_a = size, + }, + .data = lut->data, + }); + break; + + case SH_LUT_LITERAL: + arr_name = sh_fresh(sh, "weights"); + GLSLH("const %s "$"[%d] = %s[](\n ", + vartypes[vartype][params->comps - 1], arr_name, size, + vartypes[vartype][params->comps - 1]); + sh_append_str(sh, SH_BUF_HEADER, lut->str); + GLSLH(");\n"); + break; + + case SH_LUT_AUTO: + pl_unreachable(); + } + + if (arr_name) { + GLSLH("#define "$"(pos) ("$"[int((pos)%s)\\\n", + name, arr_name, dims > 1 ? "[0]" : ""); + int shift = params->width; + for (int i = 1; i < dims; i++) { + GLSLH(" + %d * int((pos)[%d])\\\n", shift, i); + shift *= sizes[i]; + } + GLSLH(" ])\n"); + + if (is_linear) { + pl_assert(dims == 1); + pl_assert(vartype == PL_VAR_FLOAT); + ident_t arr_lut = name; + name = sh_fresh(sh, "lut_lin"); + GLSLH("%s "$"(float fpos) { \n" + " fpos = clamp(fpos, 0.0, 1.0) * %d.0; \n" + " float fbase = floor(fpos); \n" + " float fceil = ceil(fpos); \n" + " float fcoord = fpos - fbase; \n" + " return mix("$"(fbase), "$"(fceil), fcoord); \n" + "} \n", + vartypes[PL_VAR_FLOAT][params->comps - 1], name, + size - 1, + arr_lut, arr_lut); + } + } + + if (method == SH_LUT_CUBIC && dims == 3) { + ident_t lin_lut = name; + name = sh_fresh(sh, "lut_tricubic"); + GLSLH("%s "$"(vec3 pos) { \n" + " vec3 scale = vec3(%d.0, %d.0, %d.0); \n" + " vec3 scale_inv = 1.0 / scale; \n" + " pos *= scale; \n" + " vec3 fpos = fract(pos); \n" + " vec3 base = pos - fpos; \n" + " vec3 fpos2 = fpos * fpos; \n" + " vec3 inv = 1.0 - fpos; \n" + " vec3 inv2 = inv * inv; \n" + " vec3 w0 = 1.0/6.0 * inv2 * inv; \n" + " vec3 w1 = 2.0/3.0 - 0.5 * fpos2 * (2.0 - fpos); \n" + " vec3 w2 = 2.0/3.0 - 0.5 * inv2 * (2.0 - inv); \n" + " vec3 w3 = 1.0/6.0 * fpos2 * fpos; \n" + " vec3 g0 = w0 + w1; \n" + " vec3 g1 = w2 + w3; \n" + " vec3 h0 = scale_inv * ((w1 / g0) - 1.0 + base); \n" + " vec3 h1 = scale_inv * ((w3 / g1) + 1.0 + base); \n" + " %s c000, c001, c010, c011, c100, c101, c110, c111; \n" + " c000 = "$"(h0); \n" + " c100 = "$"(vec3(h1.x, h0.y, h0.z)); \n" + " c000 = mix(c100, c000, g0.x); \n" + " c010 = "$"(vec3(h0.x, h1.y, h0.z)); \n" + " c110 = "$"(vec3(h1.x, h1.y, h0.z)); \n" + " c010 = mix(c110, c010, g0.x); \n" + " c000 = mix(c010, c000, g0.y); \n" + " c001 = "$"(vec3(h0.x, h0.y, h1.z)); \n" + " c101 = "$"(vec3(h1.x, h0.y, h1.z)); \n" + " c001 = mix(c101, c001, g0.x); \n" + " c011 = "$"(vec3(h0.x, h1.y, h1.z)); \n" + " 
c111 = "$"(h1); \n" + " c011 = mix(c111, c011, g0.x); \n" + " c001 = mix(c011, c001, g0.y); \n" + " return mix(c001, c000, g0.z); \n" + "} \n", + vartypes[PL_VAR_FLOAT][params->comps - 1], name, + sizes[0] - 1, sizes[1] - 1, sizes[2] - 1, + vartypes[PL_VAR_FLOAT][params->comps - 1], + lin_lut, lin_lut, lin_lut, lin_lut, + lin_lut, lin_lut, lin_lut, lin_lut); + } + + if (method == SH_LUT_TETRAHEDRAL) { + ident_t int_lut = name; + name = sh_fresh(sh, "lut_barycentric"); + GLSLH("%s "$"(vec3 pos) { \n" + // Compute bounding vertices and fractional part + " pos = clamp(pos, 0.0, 1.0) * vec3(%d.0, %d.0, %d.0); \n" + " vec3 base = floor(pos); \n" + " vec3 fpart = pos - base; \n" + // v0 and v3 are always 'black' and 'white', respectively + // v1 and v2 are the closest RGB and CMY vertices, respectively + " ivec3 v0 = ivec3(base), v3 = ivec3(ceil(pos)); \n" + " ivec3 v1 = v0, v2 = v3; \n" + // Table of boolean checks to simplify following math + " bvec3 c = greaterThanEqual(fpart.xyz, fpart.yzx); \n" + " bool c_xy = c.x, c_yx = !c.x, \n" + " c_yz = c.y, c_zy = !c.y, \n" + " c_zx = c.z, c_xz = !c.z; \n" + " vec3 s = fpart.xyz; \n" + " bool cond; \n", + vartypes[PL_VAR_FLOAT][params->comps - 1], name, + sizes[0] - 1, sizes[1] - 1, sizes[2] - 1); + + // Subdivision of the cube into six congruent tetrahedras + // + // For each tetrahedron, test if the point is inside, and if so, update + // the edge vertices. We test all six, even though only one case will + // ever be true, because this avoids branches. + static const char *indices[] = { "xyz", "xzy", "zxy", "zyx", "yzx", "yxz"}; + for (int i = 0; i < PL_ARRAY_SIZE(indices); i++) { + const char x = indices[i][0], y = indices[i][1], z = indices[i][2]; + GLSLH("cond = c_%c%c && c_%c%c; \n" + "s = cond ? fpart.%c%c%c : s; \n" + "v1.%c = cond ? v3.%c : v1.%c; \n" + "v2.%c = cond ? v0.%c : v2.%c; \n", + x, y, y, z, + x, y, z, + x, x, x, + z, z, z); + } + + // Interpolate in barycentric coordinates, with four texel fetches + GLSLH(" return (1.0 - s.x) * "$"(v0) + \n" + " (s.x - s.y) * "$"(v1) + \n" + " (s.y - s.z) * "$"(v2) + \n" + " (s.z) * "$"(v3); \n" + "} \n", + int_lut, int_lut, int_lut, int_lut); + } + + lut->error = false; + pl_cache_obj_free(&obj); + pl_assert(name); + return name; + +error: + lut->error = true; + pl_cache_obj_free(&obj); + return NULL_IDENT; +} diff --git a/src/shaders/meson.build b/src/shaders/meson.build new file mode 100644 index 0000000..746747c --- /dev/null +++ b/src/shaders/meson.build @@ -0,0 +1,23 @@ +shader_sources = [ + 'colorspace.c', + 'custom.c', + 'custom_mpv.c', + 'deinterlacing.c', + 'dithering.c', + 'film_grain.c', + 'film_grain_av1.c', + 'film_grain_h274.c', + 'icc.c', + 'lut.c', + 'sampling.c', +] + +foreach s : shader_sources + sources += custom_target(s, + command: glsl_preproc, + depend_files: glsl_deps, + env: python_env, + input: s, + output: s, + ) +endforeach diff --git a/src/shaders/sampling.c b/src/shaders/sampling.c new file mode 100644 index 0000000..fc10f80 --- /dev/null +++ b/src/shaders/sampling.c @@ -0,0 +1,1198 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <math.h> +#include "shaders.h" + +#include <libplacebo/colorspace.h> +#include <libplacebo/shaders/sampling.h> + +const struct pl_deband_params pl_deband_default_params = { PL_DEBAND_DEFAULTS }; + +static inline struct pl_tex_params src_params(const struct pl_sample_src *src) +{ + if (src->tex) + return src->tex->params; + + return (struct pl_tex_params) { + .w = src->tex_w, + .h = src->tex_h, + }; +} + +enum filter { + NEAREST = PL_TEX_SAMPLE_NEAREST, + LINEAR = PL_TEX_SAMPLE_LINEAR, + BEST, + FASTEST, +}; + +// Helper function to compute the src/dst sizes and upscaling ratios +static bool setup_src(pl_shader sh, const struct pl_sample_src *src, + ident_t *src_tex, ident_t *pos, ident_t *pt, + float *ratio_x, float *ratio_y, uint8_t *comp_mask, + float *scale, bool resizeable, + enum filter filter) +{ + enum pl_shader_sig sig; + float src_w, src_h; + enum pl_tex_sample_mode sample_mode; + if (src->tex) { + pl_fmt fmt = src->tex->params.format; + bool can_linear = fmt->caps & PL_FMT_CAP_LINEAR; + pl_assert(pl_tex_params_dimension(src->tex->params) == 2); + sig = PL_SHADER_SIG_NONE; + src_w = pl_rect_w(src->rect); + src_h = pl_rect_h(src->rect); + switch (filter) { + case FASTEST: + case NEAREST: + sample_mode = PL_TEX_SAMPLE_NEAREST; + break; + case LINEAR: + if (!can_linear) { + SH_FAIL(sh, "Trying to use a shader that requires linear " + "sampling with a texture whose format (%s) does not " + "support PL_FMT_CAP_LINEAR", fmt->name); + return false; + } + sample_mode = PL_TEX_SAMPLE_LINEAR; + break; + case BEST: + sample_mode = can_linear ? 
PL_TEX_SAMPLE_LINEAR : PL_TEX_SAMPLE_NEAREST; + break; + } + } else { + pl_assert(src->tex_w && src->tex_h); + sig = PL_SHADER_SIG_SAMPLER; + src_w = src->sampled_w; + src_h = src->sampled_h; + if (filter == BEST || filter == FASTEST) { + sample_mode = src->mode; + } else { + sample_mode = (enum pl_tex_sample_mode) filter; + if (sample_mode != src->mode) { + SH_FAIL(sh, "Trying to use a shader that requires a different " + "filter mode than the external sampler."); + return false; + } + } + } + + src_w = PL_DEF(src_w, src_params(src).w); + src_h = PL_DEF(src_h, src_params(src).h); + pl_assert(src_w && src_h); + + int out_w = PL_DEF(src->new_w, roundf(fabs(src_w))); + int out_h = PL_DEF(src->new_h, roundf(fabs(src_h))); + pl_assert(out_w && out_h); + + if (ratio_x) + *ratio_x = out_w / fabs(src_w); + if (ratio_y) + *ratio_y = out_h / fabs(src_h); + if (scale) + *scale = PL_DEF(src->scale, 1.0); + + if (comp_mask) { + uint8_t tex_mask = 0x0Fu; + if (src->tex) { + // Mask containing only the number of components in the texture + tex_mask = (1 << src->tex->params.format->num_components) - 1; + } + + uint8_t src_mask = src->component_mask; + if (!src_mask) + src_mask = (1 << PL_DEF(src->components, 4)) - 1; + + // Only actually sample components that are both requested and + // available in the texture being sampled + *comp_mask = tex_mask & src_mask; + } + + if (resizeable) + out_w = out_h = 0; + if (!sh_require(sh, sig, out_w, out_h)) + return false; + + if (src->tex) { + pl_rect2df rect = { + .x0 = src->rect.x0, + .y0 = src->rect.y0, + .x1 = src->rect.x0 + src_w, + .y1 = src->rect.y0 + src_h, + }; + + *src_tex = sh_bind(sh, src->tex, src->address_mode, sample_mode, + "src_tex", &rect, pos, pt); + } else { + if (pt) { + float sx = 1.0 / src->tex_w, sy = 1.0 / src->tex_h; + if (src->sampler == PL_SAMPLER_RECT) + sx = sy = 1.0; + + *pt = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_vec2("tex_pt"), + .data = &(float[2]) { sx, sy }, + }); + } + + sh->sampler_type = src->sampler; + + pl_assert(src->format); + switch (src->format) { + case PL_FMT_UNKNOWN: + case PL_FMT_FLOAT: + case PL_FMT_UNORM: + case PL_FMT_SNORM: sh->sampler_prefix = ' '; break; + case PL_FMT_UINT: sh->sampler_prefix = 'u'; break; + case PL_FMT_SINT: sh->sampler_prefix = 's'; break; + case PL_FMT_TYPE_COUNT: + pl_unreachable(); + } + + *src_tex = sh_fresh(sh, "src_tex"); + *pos = sh_fresh(sh, "pos"); + + GLSLH("#define "$" src_tex \n" + "#define "$" pos \n", + *src_tex, *pos); + } + + return true; +} + +void pl_shader_deband(pl_shader sh, const struct pl_sample_src *src, + const struct pl_deband_params *params) +{ + float scale; + ident_t tex, pos, pt; + uint8_t mask; + if (!setup_src(sh, src, &tex, &pos, &pt, NULL, NULL, &mask, &scale, false, LINEAR)) + return; + + params = PL_DEF(params, &pl_deband_default_params); + sh_describe(sh, "debanding"); + GLSL("vec4 color; \n" + "// pl_shader_deband \n" + "{ \n" + "vec2 pos = "$", pt = "$"; \n" + "color = textureLod("$", pos, 0.0);\n", + pos, pt, tex); + + mask &= ~0x8u; // ignore alpha channel + uint8_t num_comps = sh_num_comps(mask); + const char *swiz = sh_swizzle(mask); + pl_assert(num_comps <= 3); + if (!num_comps) { + GLSL("color *= "$"; \n" + "} \n", + SH_FLOAT(scale)); + return; + } + + GLSL("#define GET(X, Y) \\\n" + " (textureLod("$", pos + pt * vec2(X, Y), 0.0).%s) \n" + "#define T %s \n", + tex, swiz, sh_float_type(mask)); + + ident_t prng = sh_prng(sh, true, NULL); + GLSL("T avg, diff, bound; \n" + "T res = color.%s; \n" + "vec2 d; \n", + swiz); + + if 
(params->iterations > 0) { + ident_t radius = sh_const_float(sh, "radius", params->radius); + ident_t threshold = sh_const_float(sh, "threshold", + params->threshold / (1000 * scale)); + + // For each iteration, compute the average at a given distance and + // pick it instead of the color if the difference is below the threshold. + for (int i = 1; i <= params->iterations; i++) { + GLSL(// Compute a random angle and distance + "d = "$".xy * vec2(%d.0 * "$", %f); \n" + "d = d.x * vec2(cos(d.y), sin(d.y)); \n" + // Sample at quarter-turn intervals around the source pixel + "avg = T(0.0); \n" + "avg += GET(+d.x, +d.y); \n" + "avg += GET(-d.x, +d.y); \n" + "avg += GET(-d.x, -d.y); \n" + "avg += GET(+d.x, -d.y); \n" + "avg *= 0.25; \n" + // Compare the (normalized) average against the pixel + "diff = abs(res - avg); \n" + "bound = T("$" / %d.0); \n", + prng, i, radius, M_PI * 2, + threshold, i); + + if (num_comps > 1) { + GLSL("res = mix(avg, res, greaterThan(diff, bound)); \n"); + } else { + GLSL("res = mix(avg, res, diff > bound); \n"); + } + } + } + + // Add some random noise to smooth out residual differences + if (params->grain > 0) { + // Avoid adding grain near true black + GLSL("bound = T(\n"); + for (int c = 0; c < num_comps; c++) { + GLSL("%c"$, c > 0 ? ',' : ' ', + SH_FLOAT(params->grain_neutral[c] / scale)); + } + GLSL("); \n" + "T strength = min(abs(res - bound), "$"); \n" + "res += strength * (T("$") - T(0.5)); \n", + SH_FLOAT(params->grain / (1000.0 * scale)), prng); + } + + GLSL("color.%s = res; \n" + "color *= "$"; \n" + "#undef T \n" + "#undef GET \n" + "} \n", + swiz, SH_FLOAT(scale)); +} + +bool pl_shader_sample_direct(pl_shader sh, const struct pl_sample_src *src) +{ + float scale; + ident_t tex, pos; + if (!setup_src(sh, src, &tex, &pos, NULL, NULL, NULL, NULL, &scale, true, BEST)) + return false; + + GLSL("// pl_shader_sample_direct \n" + "vec4 color = vec4("$") * textureLod("$", "$", 0.0); \n", + SH_FLOAT(scale), tex, pos); + return true; +} + +bool pl_shader_sample_nearest(pl_shader sh, const struct pl_sample_src *src) +{ + float scale; + ident_t tex, pos; + if (!setup_src(sh, src, &tex, &pos, NULL, NULL, NULL, NULL, &scale, true, NEAREST)) + return false; + + sh_describe(sh, "nearest"); + GLSL("// pl_shader_sample_nearest \n" + "vec4 color = vec4("$") * textureLod("$", "$", 0.0); \n", + SH_FLOAT(scale), tex, pos); + return true; +} + +bool pl_shader_sample_bilinear(pl_shader sh, const struct pl_sample_src *src) +{ + float scale; + ident_t tex, pos; + if (!setup_src(sh, src, &tex, &pos, NULL, NULL, NULL, NULL, &scale, true, LINEAR)) + return false; + + sh_describe(sh, "bilinear"); + GLSL("// pl_shader_sample_bilinear \n" + "vec4 color = vec4("$") * textureLod("$", "$", 0.0); \n", + SH_FLOAT(scale), tex, pos); + return true; +} + +bool pl_shader_sample_bicubic(pl_shader sh, const struct pl_sample_src *src) +{ + ident_t tex, pos, pt; + float rx, ry, scale; + if (!setup_src(sh, src, &tex, &pos, &pt, &rx, &ry, NULL, &scale, true, LINEAR)) + return false; + + if (rx < 1 || ry < 1) { + PL_TRACE(sh, "Using fast bicubic sampling when downscaling. 
This " + "will most likely result in nasty aliasing!"); + } + + // Explanation of how bicubic scaling with only 4 texel fetches is done: + // http://www.mate.tue.nl/mate/pdfs/10318.pdf + // 'Efficient GPU-Based Texture Interpolation using Uniform B-Splines' + + sh_describe(sh, "bicubic"); +#pragma GLSL /* pl_shader_sample_bicubic */ \ + vec4 color; \ + { \ + vec2 pos = $pos; \ + vec2 size = vec2(textureSize($tex, 0)); \ + vec2 frac = fract(pos * size + vec2(0.5)); \ + vec2 frac2 = frac * frac; \ + vec2 inv = vec2(1.0) - frac; \ + vec2 inv2 = inv * inv; \ + /* compute filter weights directly */ \ + vec2 w0 = 1.0/6.0 * inv2 * inv; \ + vec2 w1 = 2.0/3.0 - 0.5 * frac2 * (2.0 - frac); \ + vec2 w2 = 2.0/3.0 - 0.5 * inv2 * (2.0 - inv); \ + vec2 w3 = 1.0/6.0 * frac2 * frac; \ + vec4 g = vec4(w0 + w1, w2 + w3); \ + vec4 h = vec4(w1, w3) / g + inv.xyxy; \ + h.xy -= vec2(2.0); \ + /* sample four corners, then interpolate */ \ + vec4 p = pos.xyxy + $pt.xyxy * h; \ + vec4 c00 = textureLod($tex, p.xy, 0.0); \ + vec4 c01 = textureLod($tex, p.xw, 0.0); \ + vec4 c0 = mix(c01, c00, g.y); \ + vec4 c10 = textureLod($tex, p.zy, 0.0); \ + vec4 c11 = textureLod($tex, p.zw, 0.0); \ + vec4 c1 = mix(c11, c10, g.y); \ + color = ${float:scale} * mix(c1, c0, g.x); \ + } + + return true; +} + +bool pl_shader_sample_hermite(pl_shader sh, const struct pl_sample_src *src) +{ + ident_t tex, pos, pt; + float rx, ry, scale; + if (!setup_src(sh, src, &tex, &pos, &pt, &rx, &ry, NULL, &scale, true, LINEAR)) + return false; + + if (rx < 1 || ry < 1) { + PL_TRACE(sh, "Using fast hermite sampling when downscaling. This " + "will most likely result in nasty aliasing!"); + } + + sh_describe(sh, "hermite"); +#pragma GLSL /* pl_shader_sample_hermite */ \ + vec4 color; \ + { \ + vec2 pos = $pos; \ + vec2 size = vec2(textureSize($tex, 0)); \ + vec2 frac = fract(pos * size + vec2(0.5)); \ + pos += $pt * (smoothstep(0.0, 1.0, frac) - frac); \ + color = ${float:scale} * textureLod($tex, pos, 0.0); \ + } + + return true; +} + +bool pl_shader_sample_gaussian(pl_shader sh, const struct pl_sample_src *src) +{ + ident_t tex, pos, pt; + float rx, ry, scale; + if (!setup_src(sh, src, &tex, &pos, &pt, &rx, &ry, NULL, &scale, true, LINEAR)) + return false; + + if (rx < 1 || ry < 1) { + PL_TRACE(sh, "Using fast gaussian sampling when downscaling. 
This " + "will most likely result in nasty aliasing!"); + } + + sh_describe(sh, "gaussian"); +#pragma GLSL /* pl_shader_sample_gaussian */ \ + vec4 color; \ + { \ + vec2 pos = $pos; \ + vec2 size = vec2(textureSize($tex, 0)); \ + vec2 off = -fract(pos * size + vec2(0.5)); \ + vec2 off2 = -2.0 * off * off; \ + /* compute gaussian weights */ \ + vec2 w0 = exp(off2 + 4.0 * off - vec2(2.0)); \ + vec2 w1 = exp(off2); \ + vec2 w2 = exp(off2 - 4.0 * off - vec2(2.0)); \ + vec2 w3 = exp(off2 - 8.0 * off - vec2(8.0)); \ + vec4 g = vec4(w0 + w1, w2 + w3); \ + vec4 h = vec4(w1, w3) / g; \ + h.xy -= vec2(1.0); \ + h.zw += vec2(1.0); \ + g.xy /= g.xy + g.zw; /* explicitly normalize */ \ + /* sample four corners, then interpolate */ \ + vec4 p = pos.xyxy + $pt.xyxy * (h + off.xyxy); \ + vec4 c00 = textureLod($tex, p.xy, 0.0); \ + vec4 c01 = textureLod($tex, p.xw, 0.0); \ + vec4 c0 = mix(c01, c00, g.y); \ + vec4 c10 = textureLod($tex, p.zy, 0.0); \ + vec4 c11 = textureLod($tex, p.zw, 0.0); \ + vec4 c1 = mix(c11, c10, g.y); \ + color = ${float:scale} * mix(c1, c0, g.x); \ + } + + return true; +} + +bool pl_shader_sample_oversample(pl_shader sh, const struct pl_sample_src *src, + float threshold) +{ + ident_t tex, pos, pt; + float rx, ry, scale; + if (!setup_src(sh, src, &tex, &pos, &pt, &rx, &ry, NULL, &scale, true, LINEAR)) + return false; + + threshold = PL_CLAMP(threshold, 0.0f, 0.5f); + sh_describe(sh, "oversample"); + #pragma GLSL /* pl_shader_sample_oversample */ \ + vec4 color; \ + { \ + vec2 pos = $pos; \ + vec2 size = vec2(textureSize($tex, 0)); \ + /* Round the position to the nearest pixel */ \ + vec2 fcoord = fract(pos * size - vec2(0.5)); \ + float rx = ${dynamic float:rx}; \ + float ry = ${dynamic float:ry}; \ + vec2 coeff = (fcoord - vec2(0.5)) * vec2(rx, ry); \ + coeff = clamp(coeff + vec2(0.5), 0.0, 1.0); \ + @if (threshold > 0) { \ + float thresh = ${float:threshold}; \ + coeff = mix(coeff, vec2(0.0), \ + lessThan(coeff, vec2(thresh))); \ + coeff = mix(coeff, vec2(1.0), \ + greaterThan(coeff, vec2(1.0 - thresh))); \ + @} \ + \ + /* Compute the right output blend of colors */ \ + pos += (coeff - fcoord) * $pt; \ + color = ${float:scale} * textureLod($tex, pos, 0.0); \ + } + + return true; +} + +static void describe_filter(pl_shader sh, const struct pl_filter_config *cfg, + const char *stage, float rx, float ry) +{ + const char *dir; + if (rx > 1 && ry > 1) { + dir = "up"; + } else if (rx < 1 && ry < 1) { + dir = "down"; + } else if (rx == 1 && ry == 1) { + dir = "noop"; + } else { + dir = "ana"; + } + + if (cfg->name) { + sh_describef(sh, "%s %sscaling (%s)", stage, dir, cfg->name); + } else if (cfg->window) { + sh_describef(sh, "%s %sscaling (%s+%s)", stage, dir, + PL_DEF(cfg->kernel->name, "unknown"), + PL_DEF(cfg->window->name, "unknown")); + } else { + sh_describef(sh, "%s %sscaling (%s)", stage, dir, + PL_DEF(cfg->kernel->name, "unknown")); + } +} + +// Subroutine for computing and adding an individual texel contribution +// If `in` is NULL, samples directly +// If `in` is set, takes the pixel from inX[idx] where X is the component, +// `in` is the given identifier, and `idx` must be defined by the caller +static void polar_sample(pl_shader sh, pl_filter filter, + ident_t tex, ident_t lut, ident_t radius, + int x, int y, uint8_t comp_mask, ident_t in, + bool use_ar, ident_t scale) +{ + // Since we can't know the subpixel position in advance, assume a + // worst case scenario + int yy = y > 0 ? y-1 : y; + int xx = x > 0 ? 
x-1 : x; + float dmin = sqrt(xx*xx + yy*yy); + // Skip samples definitely outside the radius + if (dmin >= filter->radius) + return; + + // Check for samples that might be skippable + bool maybe_skippable = dmin >= filter->radius - M_SQRT2; + + // Check for samples that definitely won't contribute to anti-ringing + const float ar_radius = filter->radius_zero; + use_ar &= dmin < ar_radius; + +#pragma GLSL \ + offset = ivec2(${const int: x}, ${const int: y}); \ + d = length(vec2(offset) - fcoord); \ + @if (maybe_skippable) \ + if (d < $radius) { \ + w = $lut(d * 1.0 / $radius); \ + wsum += w; \ + @if (in != NULL_IDENT) { \ + @for (c : comp_mask) \ + c[@c] = ${in}_@c[idx]; \ + @} else { \ + c = textureLod($tex, base + pt * vec2(offset), 0.0); \ + @} \ + @for (c : comp_mask) \ + color[@c] += w * c[@c]; \ + @if (use_ar) { \ + if (d <= ${const float: ar_radius}) { \ + @for (c : comp_mask) { \ + cc = vec2($scale * c[@c]); \ + cc.x = 1.0 - cc.x; \ + ww = cc + vec2(0.10); \ + ww = ww * ww; \ + ww = ww * ww; \ + ww = ww * ww; \ + ww = ww * ww; \ + ww = ww * ww; \ + ww = w * ww; \ + ar@c += ww * cc; \ + wwsum@c += ww; \ + @} \ + } \ + @} \ + @if (maybe_skippable) \ + } +} + +struct sh_sampler_obj { + pl_filter filter; + pl_shader_obj lut; + pl_shader_obj pass2; // for pl_shader_sample_ortho +}; + +#define SCALER_LUT_SIZE 256 +#define SCALER_LUT_CUTOFF 1e-3f + +static void sh_sampler_uninit(pl_gpu gpu, void *ptr) +{ + struct sh_sampler_obj *obj = ptr; + pl_shader_obj_destroy(&obj->lut); + pl_shader_obj_destroy(&obj->pass2); + pl_filter_free(&obj->filter); + *obj = (struct sh_sampler_obj) {0}; +} + +static void fill_polar_lut(void *data, const struct sh_lut_params *params) +{ + const struct sh_sampler_obj *obj = params->priv; + pl_filter filt = obj->filter; + + pl_assert(params->width == filt->params.lut_entries && params->comps == 1); + memcpy(data, filt->weights, params->width * sizeof(float)); +} + +bool pl_shader_sample_polar(pl_shader sh, const struct pl_sample_src *src, + const struct pl_sample_filter_params *params) +{ + pl_assert(params); + if (!params->filter.polar) { + SH_FAIL(sh, "Trying to use polar sampling with a non-polar filter?"); + return false; + } + + uint8_t cmask; + float rx, ry, scalef; + ident_t src_tex, pos, pt, scale; + if (!setup_src(sh, src, &src_tex, &pos, &pt, &rx, &ry, &cmask, &scalef, false, FASTEST)) + return false; + + struct sh_sampler_obj *obj; + obj = SH_OBJ(sh, params->lut, PL_SHADER_OBJ_SAMPLER, struct sh_sampler_obj, + sh_sampler_uninit); + if (!obj) + return false; + + float inv_scale = 1.0 / PL_MIN(rx, ry); + inv_scale = PL_MAX(inv_scale, 1.0); + if (params->no_widening) + inv_scale = 1.0; + scale = sh_const_float(sh, "scale", scalef); + + struct pl_filter_config cfg = params->filter; + cfg.antiring = PL_DEF(cfg.antiring, params->antiring); + cfg.blur = PL_DEF(cfg.blur, 1.0f) * inv_scale; + bool update = !obj->filter || !pl_filter_config_eq(&obj->filter->params.config, &cfg); + if (update) { + pl_filter_free(&obj->filter); + obj->filter = pl_filter_generate(sh->log, pl_filter_params( + .config = cfg, + .lut_entries = SCALER_LUT_SIZE, + .cutoff = SCALER_LUT_CUTOFF, + )); + + if (!obj->filter) { + // This should never happen, but just in case .. 
+ SH_FAIL(sh, "Failed initializing polar filter!"); + return false; + } + } + + describe_filter(sh, &cfg, "polar", rx, ry); + GLSL("// pl_shader_sample_polar \n" + "vec4 color = vec4(0.0); \n" + "{ \n" + "vec2 pos = "$", pt = "$"; \n" + "vec2 size = vec2(textureSize("$", 0)); \n" + "vec2 fcoord = fract(pos * size - vec2(0.5)); \n" + "vec2 base = pos - pt * fcoord; \n" + "vec2 center = base + pt * vec2(0.5); \n" + "ivec2 offset; \n" + "float w, d, wsum = 0.0; \n" + "int idx; \n" + "vec4 c; \n", + pos, pt, src_tex); + + bool use_ar = cfg.antiring > 0; + if (use_ar) { +#pragma GLSL \ + vec2 ww, cc; \ + @for (c : cmask) \ + vec2 ar@c = vec2(0.0), wwsum@c = vec2(0.0); + } + + int bound = ceil(obj->filter->radius); + int offset = bound - 1; // padding top/left + int padding = offset + bound; // total padding + + // Determined experimentally on modern AMD and Nvidia hardware. 32 is a + // good tradeoff for the horizontal work group size. Apart from that, + // just use as many threads as possible. + const int bw = 32, bh = sh_glsl(sh).max_group_threads / bw; + + // We need to sample everything from base_min to base_max, so make sure we + // have enough room in shmem. The extra margin on the ceilf guards against + // floating point inaccuracy on near-integer scaling ratios. + const float margin = 1e-5; + int iw = (int) ceilf(bw / rx - margin) + padding + 1, + ih = (int) ceilf(bh / ry - margin) + padding + 1; + int sizew = iw, sizeh = ih; + + pl_gpu gpu = SH_GPU(sh); + bool dynamic_size = SH_PARAMS(sh).dynamic_constants || + !gpu || !gpu->limits.array_size_constants; + if (dynamic_size) { + // Overallocate the array slightly to reduce recompilation overhead + sizew = PL_ALIGN2(sizew, 8); + sizeh = PL_ALIGN2(sizeh, 8); + } + + int num_comps = __builtin_popcount(cmask); + int shmem_req = (sizew * sizeh * num_comps + 2) * sizeof(float); + bool is_compute = !params->no_compute && sh_glsl(sh).compute && + sh_try_compute(sh, bw, bh, false, shmem_req); + + // Note: SH_LUT_LITERAL might be faster in some specific cases, but not by + // much, and it's catastrophically slow on other platforms. 
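+ // Store the polar filter weights as a linearly-sampled texture LUT.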
+ ident_t lut = sh_lut(sh, sh_lut_params( + .object = &obj->lut, + .lut_type = SH_LUT_TEXTURE, + .var_type = PL_VAR_FLOAT, + .method = SH_LUT_LINEAR, + .width = SCALER_LUT_SIZE, + .comps = 1, + .update = update, + .fill = fill_polar_lut, + .priv = obj, + )); + + if (!lut) { + SH_FAIL(sh, "Failed initializing polar LUT!"); + return false; + } + + ident_t radius_c = sh_const_float(sh, "radius", obj->filter->radius); + ident_t in = sh_fresh(sh, "in"); + + if (is_compute) { + + // Compute shader kernel + GLSL("uvec2 base_id = uvec2(0u); \n"); + if (src->rect.x0 > src->rect.x1) + GLSL("base_id.x = gl_WorkGroupSize.x - 1u; \n"); + if (src->rect.y0 > src->rect.y1) + GLSL("base_id.y = gl_WorkGroupSize.y - 1u; \n"); + + GLSLH("shared vec2 "$"_base; \n", in); + GLSL("if (gl_LocalInvocationID.xy == base_id) \n" + " "$"_base = base; \n" + "barrier(); \n" + "ivec2 rel = ivec2(round((base - "$"_base) * size)); \n", + in, in); + + ident_t sizew_c = sh_const(sh, (struct pl_shader_const) { + .type = PL_VAR_SINT, + .compile_time = true, + .name = "sizew", + .data = &sizew, + }); + + ident_t sizeh_c = sh_const(sh, (struct pl_shader_const) { + .type = PL_VAR_SINT, + .compile_time = true, + .name = "sizeh", + .data = &sizeh, + }); + + ident_t iw_c = sizew_c, ih_c = sizeh_c; + if (dynamic_size) { + iw_c = sh_const_int(sh, "iw", iw); + ih_c = sh_const_int(sh, "ih", ih); + } + + // Load all relevant texels into shmem + GLSL("for (int y = int(gl_LocalInvocationID.y); y < "$"; y += %d) { \n" + "for (int x = int(gl_LocalInvocationID.x); x < "$"; x += %d) { \n" + "c = textureLod("$", "$"_base + pt * vec2(x - %d, y - %d), 0.0); \n", + ih_c, bh, iw_c, bw, src_tex, in, offset, offset); + + for (uint8_t comps = cmask; comps;) { + uint8_t c = __builtin_ctz(comps); + GLSLH("shared float "$"_%d["$" * "$"]; \n", in, c, sizeh_c, sizew_c); + GLSL(""$"_%d["$" * y + x] = c[%d]; \n", in, c, sizew_c, c); + comps &= ~(1 << c); + } + + GLSL("}} \n" + "barrier(); \n"); + + // Dispatch the actual samples + for (int y = 1 - bound; y <= bound; y++) { + for (int x = 1 - bound; x <= bound; x++) { + GLSL("idx = "$" * rel.y + rel.x + "$" * %d + %d; \n", + sizew_c, sizew_c, y + offset, x + offset); + polar_sample(sh, obj->filter, src_tex, lut, radius_c, + x, y, cmask, in, use_ar, scale); + } + } + } else { + // Fragment shader sampling + for (uint8_t comps = cmask; comps;) { + uint8_t c = __builtin_ctz(comps); + GLSL("vec4 "$"_%d; \n", in, c); + comps &= ~(1 << c); + } + + // For maximum efficiency, we want to use textureGather() if + // possible, rather than direct sampling. Since this is not + // always possible/sensible, we need to possibly intermix gathering + // with regular sampling. This requires keeping track of which + // pixels in the next row were already gathered by the previous + // row. + uint32_t gathered_cur = 0x0, gathered_next = 0x0; + const float radius2 = PL_SQUARE(obj->filter->radius); + const int base = bound - 1; + + if (base + bound >= 8 * sizeof(gathered_cur)) { + SH_FAIL(sh, "Polar radius %f exceeds implementation capacity!", + obj->filter->radius); + return false; + } + + for (int y = 1 - bound; y <= bound; y++) { + for (int x = 1 - bound; x <= bound; x++) { + // Skip already gathered texels + uint32_t bit = 1llu << (base + x); + if (gathered_cur & bit) + continue; + + // Using texture gathering is only more efficient than direct + // sampling in the case where we expect to be able to use all + // four gathered texels, without having to discard any. 
So + // only do it if we suspect it will be a win rather than a + // loss. + int xx = x*x, xx1 = (x+1)*(x+1); + int yy = y*y, yy1 = (y+1)*(y+1); + bool use_gather = PL_MAX(xx, xx1) + PL_MAX(yy, yy1) < radius2; + use_gather &= PL_MAX(x, y) <= sh_glsl(sh).max_gather_offset; + use_gather &= PL_MIN(x, y) >= sh_glsl(sh).min_gather_offset; + use_gather &= !src->tex || src->tex->params.format->gatherable; + + // Gathering from components other than the R channel requires + // support for GLSL 400, which introduces the overload of + // textureGather* that allows specifying the component. + // + // This is also the minimum requirement if we don't know the + // texture format capabilities, for the sampler2D interface + if (cmask != 0x1 || !src->tex) + use_gather &= sh_glsl(sh).version >= 400; + + if (!use_gather) { + // Switch to direct sampling instead + polar_sample(sh, obj->filter, src_tex, lut, radius_c, + x, y, cmask, NULL_IDENT, use_ar, scale); + continue; + } + + // Gather the four surrounding texels simultaneously + for (uint8_t comps = cmask; comps;) { + uint8_t c = __builtin_ctz(comps); + if (x || y) { + if (c) { + GLSL($"_%d = textureGatherOffset("$", " + "center, ivec2(%d, %d), %d); \n", + in, c, src_tex, x, y, c); + } else { + GLSL($"_0 = textureGatherOffset("$", " + "center, ivec2(%d, %d)); \n", + in, src_tex, x, y); + } + } else { + if (c) { + GLSL($"_%d = textureGather("$", center, %d); \n", + in, c, src_tex, c); + } else { + GLSL($"_0 = textureGather("$", center); \n", + in, src_tex); + } + } + comps &= ~(1 << c); + } + + // Mix in all of the points with their weights + for (int p = 0; p < 4; p++) { + // The four texels are gathered counterclockwise starting + // from the bottom left + static const int xo[4] = {0, 1, 1, 0}; + static const int yo[4] = {1, 1, 0, 0}; + if (x+xo[p] > bound || y+yo[p] > bound) + continue; // next subpixel + + GLSL("idx = %d;\n", p); + polar_sample(sh, obj->filter, src_tex, lut, radius_c, + x+xo[p], y+yo[p], cmask, in, use_ar, scale); + } + + // Mark the other next row's pixels as already gathered + gathered_next |= bit | (bit << 1); + x++; // skip adjacent pixel + } + + // Prepare for new row + gathered_cur = gathered_next; + gathered_next = 0; + } + } + +#pragma GLSL \ + color = $scale / wsum * color; \ + @if (use_ar) { \ + @for (c : cmask) { \ + ww = ar@c / wwsum@c; \ + ww.x = 1.0 - ww.x; \ + w = clamp(color[@c], ww.x, ww.y); \ + w = mix(w, dot(ww, vec2(0.5)), ww.x > ww.y); \ + color[@c] = mix(color[@c], w, ${float:cfg.antiring}); \ + @} \ + @} \ + @if (!(cmask & (1 << PL_CHANNEL_A))) \ + color.a = 1.0; \ + } + + return true; +} + +static void fill_ortho_lut(void *data, const struct sh_lut_params *params) +{ + const struct sh_sampler_obj *obj = params->priv; + pl_filter filt = obj->filter; + + if (filt->radius == filt->radius_zero) { + // Main lobe covers entire radius, so all weights are positive, meaning + // we can use the linear resampling trick + for (int n = 0; n < SCALER_LUT_SIZE; n++) { + const float *weights = filt->weights + n * filt->row_stride; + float *row = (float *) data + n * filt->row_stride; + pl_assert(filt->row_size % 2 == 0); + for (int i = 0; i < filt->row_size; i += 2) { + const float w0 = weights[i], w1 = weights[i+1]; + assert(w0 + w1 >= 0.0f); + row[i] = w0 + w1; + row[i+1] = w1 / (w0 + w1); + } + } + } else { + size_t entries = SCALER_LUT_SIZE * filt->row_stride; + pl_assert(params->width * params->height * params->comps == entries); + memcpy(data, filt->weights, entries * sizeof(float)); + } +} + +enum { + SEP_VERT = 0, + 
SEP_HORIZ, + SEP_PASSES +}; + +bool pl_shader_sample_ortho2(pl_shader sh, const struct pl_sample_src *src, + const struct pl_sample_filter_params *params) +{ + pl_assert(params); + if (params->filter.polar) { + SH_FAIL(sh, "Trying to use separated sampling with a polar filter?"); + return false; + } + + pl_gpu gpu = SH_GPU(sh); + pl_assert(gpu); + + uint8_t comps; + float ratio[SEP_PASSES], scale; + ident_t src_tex, pos, pt; + if (!setup_src(sh, src, &src_tex, &pos, &pt, + &ratio[SEP_HORIZ], &ratio[SEP_VERT], + &comps, &scale, false, LINEAR)) + return false; + + + int pass; + if (fabs(ratio[SEP_HORIZ] - 1.0f) < 1e-6f) { + pass = SEP_VERT; + } else if (fabs(ratio[SEP_VERT] - 1.0f) < 1e-6f) { + pass = SEP_HORIZ; + } else { + SH_FAIL(sh, "Trying to use pl_shader_sample_ortho with a " + "pl_sample_src that requires scaling in multiple directions " + "(rx=%f, ry=%f), this is not possible!", + ratio[SEP_HORIZ], ratio[SEP_VERT]); + return false; + } + + // We can store a separate sampler object per dimension, so dispatch the + // right one. This is needed for two reasons: + // 1. Anamorphic content can have a different scaling ratio for each + // dimension. In particular, you could be upscaling in one and + // downscaling in the other. + // 2. After fixing the source for `setup_src`, we lose information about + // the scaling ratio of the other component. (Although this is only a + // minor reason and could easily be changed with some boilerplate) + struct sh_sampler_obj *obj; + obj = SH_OBJ(sh, params->lut, PL_SHADER_OBJ_SAMPLER, + struct sh_sampler_obj, sh_sampler_uninit); + if (!obj) + return false; + + if (pass != 0) { + obj = SH_OBJ(sh, &obj->pass2, PL_SHADER_OBJ_SAMPLER, + struct sh_sampler_obj, sh_sampler_uninit); + assert(obj); + } + + float inv_scale = 1.0 / ratio[pass]; + inv_scale = PL_MAX(inv_scale, 1.0); + if (params->no_widening) + inv_scale = 1.0; + + struct pl_filter_config cfg = params->filter; + cfg.antiring = PL_DEF(cfg.antiring, params->antiring); + cfg.blur = PL_DEF(cfg.blur, 1.0f) * inv_scale; + bool update = !obj->filter || !pl_filter_config_eq(&obj->filter->params.config, &cfg); + + if (update) { + pl_filter_free(&obj->filter); + obj->filter = pl_filter_generate(sh->log, pl_filter_params( + .config = cfg, + .lut_entries = SCALER_LUT_SIZE, + .max_row_size = gpu->limits.max_tex_2d_dim / 4, + .row_stride_align = 4, + )); + + if (!obj->filter) { + // This should never happen, but just in case .. 
+ SH_FAIL(sh, "Failed initializing separated filter!"); + return false; + } + } + + int N = obj->filter->row_size; // number of samples to convolve + int width = obj->filter->row_stride / 4; // width of the LUT texture + ident_t lut = sh_lut(sh, sh_lut_params( + .object = &obj->lut, + .var_type = PL_VAR_FLOAT, + .method = SH_LUT_LINEAR, + .width = width, + .height = SCALER_LUT_SIZE, + .comps = 4, + .update = update, + .fill = fill_ortho_lut, + .priv = obj, + )); + if (!lut) { + SH_FAIL(sh, "Failed initializing separated LUT!"); + return false; + } + + const int dir[SEP_PASSES][2] = { + [SEP_HORIZ] = {1, 0}, + [SEP_VERT] = {0, 1}, + }; + + static const char *names[SEP_PASSES] = { + [SEP_HORIZ] = "ortho (horiz)", + [SEP_VERT] = "ortho (vert)", + }; + + describe_filter(sh, &cfg, names[pass], ratio[pass], ratio[pass]); + + float denom = PL_MAX(1, width - 1); // avoid division by zero + bool use_ar = cfg.antiring > 0 && ratio[pass] > 1.0; + bool use_linear = obj->filter->radius == obj->filter->radius_zero; + use_ar &= !use_linear; // filter has no negative weights + +#pragma GLSL /* pl_shader_sample_ortho */ \ + vec4 color = vec4(0.0, 0.0, 0.0, 1.0); \ + { \ + vec2 pos = $pos, pt = $pt; \ + vec2 size = vec2(textureSize($src_tex, 0)); \ + vec2 dir = vec2(${const float:dir[pass][0]}, ${const float: dir[pass][1]}); \ + pt *= dir; \ + vec2 fcoord2 = fract(pos * size - vec2(0.5)); \ + float fcoord = dot(fcoord2, dir); \ + vec2 base = pos - fcoord * pt - pt * vec2(${const float: N / 2 - 1}); \ + vec4 ws; \ + float off; \ + ${vecType: comps} c, ca = ${vecType: comps}(0.0); \ + @if (use_ar) { \ + ${vecType: comps} hi = ${vecType: comps}(0.0); \ + ${vecType: comps} lo = ${vecType: comps}(1e9); \ + @} \ + @for (n < N) { \ + @if @(n % 4 == 0) \ + ws = $lut(vec2(float(@n / 4) / ${const float: denom}, fcoord)); \ + @if @(vars.use_ar && (n == vars.n / 2 - 1 || n == vars.n / 2)) { \ + c = textureLod($src_tex, base + pt * @n.0, 0.0).${swizzle: comps}; \ + ca += ws[@n % 4] * c; \ + lo = min(lo, c); \ + hi = max(hi, c); \ + @} else { \ + @if (use_linear) { \ + @if @(n % 2 == 0) { \ + off = @n.0 + ws[@n % 4 + 1]; \ + ca += ws[@n % 4] * textureLod($src_tex, base + pt * off, \ + 0.0).${swizzle: comps}; \ + @} \ + @} else { \ + ca += ws[@n % 4] * textureLod($src_tex, base + pt * @n.0, \ + 0.0).${swizzle: comps}; \ + @} \ + @} \ + @} \ + @if (use_ar) \ + ca = mix(ca, clamp(ca, lo, hi), ${float: cfg.antiring}); \ + color.${swizzle: comps} = ${float: scale} * ca; \ + } + + return true; +} + +const struct pl_distort_params pl_distort_default_params = { PL_DISTORT_DEFAULTS }; + +void pl_shader_distort(pl_shader sh, pl_tex src_tex, int out_w, int out_h, + const struct pl_distort_params *params) +{ + pl_assert(params); + if (!sh_require(sh, PL_SHADER_SIG_NONE, out_w, out_h)) + return; + + const int src_w = src_tex->params.w, src_h = src_tex->params.h; + float rx = 1.0f, ry = 1.0f; + if (src_w > src_h) { + ry = (float) src_h / src_w; + } else { + rx = (float) src_w / src_h; + } + + // Map from texel coordinates [0,1]² to aspect-normalized representation + const pl_transform2x2 tex2norm = { + .mat.m = { + { 2 * rx, 0 }, + { 0, -2 * ry }, + }, + .c = { -rx, ry }, + }; + + // Map from aspect-normalized representation to canvas coords [-1,1]² + const float sx = params->unscaled ? (float) src_w / out_w : 1.0f; + const float sy = params->unscaled ? 
(float) src_h / out_h : 1.0f; + const pl_transform2x2 norm2canvas = { + .mat.m = { + { sx / rx, 0 }, + { 0, sy / ry }, + }, + }; + + struct pl_transform2x2 transform = params->transform; + pl_transform2x2_mul(&transform, &tex2norm); + pl_transform2x2_rmul(&norm2canvas, &transform); + + if (params->constrain) { + pl_rect2df bb = pl_transform2x2_bounds(&transform, &(pl_rect2df) { + .x1 = 1, .y1 = 1, + }); + const float k = fmaxf(fmaxf(pl_rect_w(bb), pl_rect_h(bb)), 2.0f); + pl_transform2x2_scale(&transform, 2.0f / k); + }; + + // Bind the canvas coordinates as [-1,1]², flipped vertically to correspond + // to normal mathematical axis conventions + static const pl_rect2df canvas = { + .x0 = -1.0f, .x1 = 1.0f, + .y0 = 1.0f, .y1 = -1.0f, + }; + + ident_t pos = sh_attr_vec2(sh, "pos", &canvas); + ident_t pt, tex = sh_bind(sh, src_tex, params->address_mode, + PL_TEX_SAMPLE_LINEAR, "tex", NULL, NULL, &pt); + + // Bind the inverse of the tex2canvas transform (i.e. canvas2tex) + pl_transform2x2_invert(&transform); + ident_t tf = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_mat2("tf"), + .data = PL_TRANSPOSE_2X2(transform.mat.m), + }); + + ident_t tf_c = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_vec2("tf_c"), + .data = transform.c, + }); + + // See pl_shader_sample_bicubic + sh_describe(sh, "distortion"); +#pragma GLSL /* pl_shader_sample_distort */ \ + vec4 color; \ + { \ + vec2 pos = $tf * $pos + $tf_c; \ + vec2 pt = $pt; \ + @if (params->bicubic) { \ + vec2 size = vec2(textureSize($tex, 0)); \ + vec2 frac = fract(pos * size + vec2(0.5)); \ + vec2 frac2 = frac * frac; \ + vec2 inv = vec2(1.0) - frac; \ + vec2 inv2 = inv * inv; \ + vec2 w0 = 1.0/6.0 * inv2 * inv; \ + vec2 w1 = 2.0/3.0 - 0.5 * frac2 * (2.0 - frac); \ + vec2 w2 = 2.0/3.0 - 0.5 * inv2 * (2.0 - inv); \ + vec2 w3 = 1.0/6.0 * frac2 * frac; \ + vec4 g = vec4(w0 + w1, w2 + w3); \ + vec4 h = vec4(w1, w3) / g + inv.xyxy; \ + h.xy -= vec2(2.0); \ + vec4 p = pos.xyxy + pt.xyxy * h; \ + vec4 c00 = textureLod($tex, p.xy, 0.0); \ + vec4 c01 = textureLod($tex, p.xw, 0.0); \ + vec4 c0 = mix(c01, c00, g.y); \ + vec4 c10 = textureLod($tex, p.zy, 0.0); \ + vec4 c11 = textureLod($tex, p.zw, 0.0); \ + vec4 c1 = mix(c11, c10, g.y); \ + color = mix(c1, c0, g.x); \ + @} else { \ + color = texture($tex, pos); \ + @} \ + @if (params->alpha_mode) { \ + vec2 border = min(pos, vec2(1.0) - pos); \ + border = smoothstep(vec2(0.0), pt, border); \ + @if (params->alpha_mode == PL_ALPHA_PREMULTIPLIED) \ + color.rgba *= border.x * border.y; \ + @else \ + color.a *= border.x * border.y; \ + @} \ + } + +} diff --git a/src/swapchain.c b/src/swapchain.c new file mode 100644 index 0000000..2b9ed90 --- /dev/null +++ b/src/swapchain.c @@ -0,0 +1,92 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */
+
+#include "common.h"
+#include "log.h"
+#include "swapchain.h"
+
+void pl_swapchain_destroy(pl_swapchain *ptr)
+{
+    pl_swapchain sw = *ptr;
+    if (!sw)
+        return;
+
+    const struct pl_sw_fns *impl = PL_PRIV(sw);
+    impl->destroy(sw);
+    *ptr = NULL;
+}
+
+int pl_swapchain_latency(pl_swapchain sw)
+{
+    const struct pl_sw_fns *impl = PL_PRIV(sw);
+    if (!impl->latency)
+        return 0;
+
+    return impl->latency(sw);
+}
+
+bool pl_swapchain_resize(pl_swapchain sw, int *width, int *height)
+{
+    int dummy[2] = {0};
+    width = PL_DEF(width, &dummy[0]);
+    height = PL_DEF(height, &dummy[1]);
+
+    const struct pl_sw_fns *impl = PL_PRIV(sw);
+    if (!impl->resize) {
+        *width = *height = 0;
+        return true;
+    }
+
+    return impl->resize(sw, width, height);
+}
+
+void pl_swapchain_colorspace_hint(pl_swapchain sw, const struct pl_color_space *csp)
+{
+    const struct pl_sw_fns *impl = PL_PRIV(sw);
+    if (!impl->colorspace_hint)
+        return;
+
+    struct pl_color_space fix = {0};
+    if (csp) {
+        fix = *csp;
+        // Ensure we have valid values set for all the fields
+        pl_color_space_infer(&fix);
+    }
+
+    impl->colorspace_hint(sw, &fix);
+}
+
+bool pl_swapchain_start_frame(pl_swapchain sw,
+                              struct pl_swapchain_frame *out_frame)
+{
+    *out_frame = (struct pl_swapchain_frame) {0}; // sanity
+
+    const struct pl_sw_fns *impl = PL_PRIV(sw);
+    return impl->start_frame(sw, out_frame);
+}
+
+bool pl_swapchain_submit_frame(pl_swapchain sw)
+{
+    const struct pl_sw_fns *impl = PL_PRIV(sw);
+    return impl->submit_frame(sw);
+}
+
+void pl_swapchain_swap_buffers(pl_swapchain sw)
+{
+    const struct pl_sw_fns *impl = PL_PRIV(sw);
+    impl->swap_buffers(sw);
+}
diff --git a/src/swapchain.h b/src/swapchain.h
new file mode 100644
index 0000000..934a2b9
--- /dev/null
+++ b/src/swapchain.h
@@ -0,0 +1,39 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "common.h"
+
+#include <libplacebo/swapchain.h>
+
+// This struct must be the first member of the swapchain's priv struct. The
+// `pl_swapchain` helpers will cast the priv struct to this struct!
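+//
+// For illustration only (not part of this header), a backend would typically
+// lay out its private data along these lines, so that PL_PRIV(sw) can be
+// cast to `const struct pl_sw_fns *`:
+//
+//     struct priv {
+//         struct pl_sw_fns impl; // must be the first member
+//         // ... backend-specific state follows
+//     };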
+ +#define SW_PFN(name) __typeof__(pl_swapchain_##name) *name +struct pl_sw_fns { + // This destructor follows the same rules as `pl_gpu_fns` + void (*destroy)(pl_swapchain sw); + + SW_PFN(latency); // optional + SW_PFN(resize); // optional + SW_PFN(colorspace_hint); // optional + SW_PFN(start_frame); + SW_PFN(submit_frame); + SW_PFN(swap_buffers); +}; +#undef SW_PFN diff --git a/src/tests/bench.c b/src/tests/bench.c new file mode 100644 index 0000000..22638d8 --- /dev/null +++ b/src/tests/bench.c @@ -0,0 +1,550 @@ +#include "tests.h" + +#include <libplacebo/dispatch.h> +#include <libplacebo/vulkan.h> +#include <libplacebo/shaders/colorspace.h> +#include <libplacebo/shaders/deinterlacing.h> +#include <libplacebo/shaders/sampling.h> + +enum { + // Image configuration + NUM_TEX = 16, + WIDTH = 2048, + HEIGHT = 2048, + DEPTH = 16, + COMPS = 4, + + // Queue configuration + NUM_QUEUES = NUM_TEX, + ASYNC_TX = 1, + ASYNC_COMP = 1, + + // Test configuration + TEST_MS = 1000, + WARMUP_MS = 500, +}; + +static pl_tex create_test_img(pl_gpu gpu) +{ + pl_fmt fmt = pl_find_fmt(gpu, PL_FMT_FLOAT, COMPS, DEPTH, 32, PL_FMT_CAP_LINEAR); + REQUIRE(fmt); + + const float xc = (WIDTH - 1) / 2.0f; + const float yc = (HEIGHT - 1) / 2.0f; + const float kf = 0.5f / sqrtf(xc * xc + yc * yc); + const float invphi = 0.61803398874989; + const float freqR = kf * M_PI * 0.2f; + const float freqG = freqR * invphi; + const float freqB = freqG * invphi; + float *data = malloc(WIDTH * HEIGHT * COMPS * sizeof(float)); + for (int y = 0; y < HEIGHT; y++) { + for (int x = 0; x < WIDTH; x++) { + float *color = &data[(y * WIDTH + x) * COMPS]; + float xx = x - xc, yy = y - yc; + float r2 = xx * xx + yy * yy; + switch (COMPS) { + case 4: color[3] = 1.0; + case 3: color[2] = 0.5f * sinf(freqB * r2) + 0.5f;; + case 2: color[1] = 0.5f * sinf(freqG * r2) + 0.5f;; + case 1: color[0] = 0.5f * sinf(freqR * r2) + 0.5f;; + } + } + } + + pl_tex tex = pl_tex_create(gpu, pl_tex_params( + .format = fmt, + .w = WIDTH, + .h = HEIGHT, + .sampleable = true, + .initial_data = data, + )); + + free(data); + REQUIRE(tex); + return tex; +} + +struct bench { + void (*run_sh)(pl_shader sh, pl_shader_obj *state, + pl_tex src); + + void (*run_tex)(pl_gpu gpu, pl_tex tex); +}; + +static void run_bench(pl_gpu gpu, pl_dispatch dp, + pl_shader_obj *state, pl_tex src, + pl_tex fbo, pl_timer timer, + const struct bench *bench) +{ + REQUIRE(bench); + REQUIRE(bench->run_sh || bench->run_tex); + if (bench->run_sh) { + pl_shader sh = pl_dispatch_begin(dp); + bench->run_sh(sh, state, src); + + pl_dispatch_finish(dp, pl_dispatch_params( + .shader = &sh, + .target = fbo, + .timer = timer, + )); + } else { + bench->run_tex(gpu, fbo); + } +} + +static void benchmark(pl_gpu gpu, const char *name, + const struct bench *bench) +{ + pl_dispatch dp = pl_dispatch_create(gpu->log, gpu); + REQUIRE(dp); + pl_shader_obj state = NULL; + pl_tex src = create_test_img(gpu); + + // Create the FBOs + pl_fmt fmt = pl_find_fmt(gpu, PL_FMT_FLOAT, COMPS, DEPTH, 32, + PL_FMT_CAP_RENDERABLE | PL_FMT_CAP_BLITTABLE); + REQUIRE(fmt); + + pl_tex fbos[NUM_TEX] = {0}; + for (int i = 0; i < NUM_TEX; i++) { + fbos[i] = pl_tex_create(gpu, pl_tex_params( + .format = fmt, + .w = WIDTH, + .h = HEIGHT, + .renderable = true, + .blit_dst = true, + .host_writable = true, + .host_readable = true, + .storable = !!(fmt->caps & PL_FMT_CAP_STORABLE), + )); + REQUIRE(fbos[i]); + + pl_tex_clear(gpu, fbos[i], (float[4]){ 0.0 }); + } + + // Run the benchmark and flush+block once to force shader compilation etc. 
+ run_bench(gpu, dp, &state, src, fbos[0], NULL, bench); + pl_gpu_finish(gpu); + + // Perform the actual benchmark + pl_clock_t start_warmup = 0, start_test = 0; + unsigned long frames = 0, frames_warmup = 0; + + pl_timer timer = pl_timer_create(gpu); + uint64_t gputime_total = 0; + unsigned long gputime_count = 0; + uint64_t gputime; + + start_warmup = pl_clock_now(); + do { + const int idx = frames % NUM_TEX; + while (pl_tex_poll(gpu, fbos[idx], UINT64_MAX)) + ; // do nothing + run_bench(gpu, dp, &state, src, fbos[idx], start_test ? timer : NULL, bench); + pl_gpu_flush(gpu); + frames++; + + if (start_test) { + while ((gputime = pl_timer_query(gpu, timer))) { + gputime_total += gputime; + gputime_count++; + } + } + + pl_clock_t now = pl_clock_now(); + if (start_test) { + if (pl_clock_diff(now, start_test) > TEST_MS * 1e-3) + break; + } else if (pl_clock_diff(now, start_warmup) > WARMUP_MS * 1e-3) { + start_test = now; + frames_warmup = frames; + } + } while (true); + + // Force the GPU to finish execution and re-measure the final stop time + pl_gpu_finish(gpu); + + pl_clock_t stop = pl_clock_now(); + while ((gputime = pl_timer_query(gpu, timer))) { + gputime_total += gputime; + gputime_count++; + } + + frames -= frames_warmup; + double secs = pl_clock_diff(stop, start_test); + printf("'%s':\t%4lu frames in %1.6f seconds => %2.6f ms/frame (%5.2f FPS)", + name, frames, secs, 1000 * secs / frames, frames / secs); + if (gputime_count) + printf(", gpu time: %2.6f ms", 1e-6 * gputime_total / gputime_count); + printf("\n"); + + pl_timer_destroy(gpu, &timer); + pl_shader_obj_destroy(&state); + pl_dispatch_destroy(&dp); + pl_tex_destroy(gpu, &src); + for (int i = 0; i < NUM_TEX; i++) + pl_tex_destroy(gpu, &fbos[i]); +} + +// List of benchmarks +static void bench_deband(pl_shader sh, pl_shader_obj *state, pl_tex src) +{ + pl_shader_deband(sh, pl_sample_src( .tex = src ), NULL); +} + +static void bench_deband_heavy(pl_shader sh, pl_shader_obj *state, pl_tex src) +{ + pl_shader_deband(sh, pl_sample_src( .tex = src ), pl_deband_params( + .iterations = 4, + .threshold = 4.0, + .radius = 4.0, + .grain = 16.0, + )); +} + +static void bench_bilinear(pl_shader sh, pl_shader_obj *state, pl_tex src) +{ + REQUIRE(pl_shader_sample_bilinear(sh, pl_sample_src( .tex = src ))); +} + +static void bench_bicubic(pl_shader sh, pl_shader_obj *state, pl_tex src) +{ + REQUIRE(pl_shader_sample_bicubic(sh, pl_sample_src( .tex = src ))); +} + +static void bench_hermite(pl_shader sh, pl_shader_obj *state, pl_tex src) +{ + REQUIRE(pl_shader_sample_hermite(sh, pl_sample_src( .tex = src ))); +} + +static void bench_gaussian(pl_shader sh, pl_shader_obj *state, pl_tex src) +{ + REQUIRE(pl_shader_sample_gaussian(sh, pl_sample_src( .tex = src ))); +} + +static void bench_dither_blue(pl_shader sh, pl_shader_obj *state, pl_tex src) +{ + REQUIRE(pl_shader_sample_direct(sh, pl_sample_src( .tex = src ))); + pl_shader_dither(sh, 8, state, pl_dither_params( + .method = PL_DITHER_BLUE_NOISE, + )); +} + +static void bench_dither_white(pl_shader sh, pl_shader_obj *state, pl_tex src) +{ + REQUIRE(pl_shader_sample_direct(sh, pl_sample_src( .tex = src ))); + pl_shader_dither(sh, 8, state, pl_dither_params( + .method = PL_DITHER_WHITE_NOISE, + )); +} + +static void bench_dither_ordered_fix(pl_shader sh, pl_shader_obj *state, pl_tex src) +{ + REQUIRE(pl_shader_sample_direct(sh, pl_sample_src( .tex = src ))); + pl_shader_dither(sh, 8, state, pl_dither_params( + .method = PL_DITHER_ORDERED_FIXED, + )); +} + +static void bench_polar(pl_shader sh, 
pl_shader_obj *state, pl_tex src)
+{
+    struct pl_sample_filter_params params = {
+        .filter = pl_filter_ewa_lanczos,
+        .lut = state,
+    };
+
+    REQUIRE(pl_shader_sample_polar(sh, pl_sample_src( .tex = src ), &params));
+}
+
+static void bench_polar_nocompute(pl_shader sh, pl_shader_obj *state, pl_tex src)
+{
+    struct pl_sample_filter_params params = {
+        .filter = pl_filter_ewa_lanczos,
+        .no_compute = true,
+        .lut = state,
+    };
+
+    REQUIRE(pl_shader_sample_polar(sh, pl_sample_src( .tex = src ), &params));
+}
+
+static void bench_hdr_peak(pl_shader sh, pl_shader_obj *state, pl_tex src)
+{
+    REQUIRE(pl_shader_sample_direct(sh, pl_sample_src( .tex = src )));
+    REQUIRE(pl_shader_detect_peak(sh, pl_color_space_hdr10, state, &pl_peak_detect_default_params));
+}
+
+static void bench_hdr_peak_hq(pl_shader sh, pl_shader_obj *state, pl_tex src)
+{
+    REQUIRE(pl_shader_sample_direct(sh, pl_sample_src( .tex = src )));
+    REQUIRE(pl_shader_detect_peak(sh, pl_color_space_hdr10, state, &pl_peak_detect_high_quality_params));
+}
+
+static void bench_hdr_lut(pl_shader sh, pl_shader_obj *state, pl_tex src)
+{
+    struct pl_color_map_params params = {
+        PL_COLOR_MAP_DEFAULTS
+        .tone_mapping_function = &pl_tone_map_bt2390,
+        .tone_mapping_mode = PL_TONE_MAP_RGB,
+    };
+
+    REQUIRE(pl_shader_sample_direct(sh, pl_sample_src( .tex = src )));
+    pl_shader_color_map_ex(sh, &params, pl_color_map_args(
+        .src = pl_color_space_hdr10,
+        .dst = pl_color_space_monitor,
+        .state = state,
+    ));
+}
+
+static void bench_hdr_clip(pl_shader sh, pl_shader_obj *state, pl_tex src)
+{
+    struct pl_color_map_params params = {
+        PL_COLOR_MAP_DEFAULTS
+        .tone_mapping_function = &pl_tone_map_clip,
+        .tone_mapping_mode = PL_TONE_MAP_RGB,
+    };
+
+    REQUIRE(pl_shader_sample_direct(sh, pl_sample_src( .tex = src )));
+    pl_shader_color_map_ex(sh, &params, pl_color_map_args(
+        .src = pl_color_space_hdr10,
+        .dst = pl_color_space_monitor,
+        .state = state,
+    ));
+}
+
+static void bench_weave(pl_shader sh, pl_shader_obj *state, pl_tex src)
+{
+    struct pl_deinterlace_source dsrc = {
+        .cur = pl_field_pair(src),
+        .field = PL_FIELD_TOP,
+    };
+
+    pl_shader_deinterlace(sh, &dsrc, pl_deinterlace_params(
+        .algo = PL_DEINTERLACE_WEAVE,
+    ));
+}
+
+static void bench_bob(pl_shader sh, pl_shader_obj *state, pl_tex src)
+{
+    struct pl_deinterlace_source dsrc = {
+        .cur = pl_field_pair(src),
+        .field = PL_FIELD_TOP,
+    };
+
+    pl_shader_deinterlace(sh, &dsrc, pl_deinterlace_params(
+        .algo = PL_DEINTERLACE_BOB,
+    ));
+}
+
+static void bench_yadif(pl_shader sh, pl_shader_obj *state, pl_tex src)
+{
+    struct pl_deinterlace_source dsrc = {
+        .prev = pl_field_pair(src),
+        .cur = pl_field_pair(src),
+        .next = pl_field_pair(src),
+        .field = PL_FIELD_TOP,
+    };
+
+    pl_shader_deinterlace(sh, &dsrc, pl_deinterlace_params(
+        .algo = PL_DEINTERLACE_YADIF,
+    ));
+}
+
+static void bench_av1_grain(pl_shader sh, pl_shader_obj *state, pl_tex src)
+{
+    struct pl_film_grain_params params = {
+        .data = {
+            .type = PL_FILM_GRAIN_AV1,
+            .params.av1 = av1_grain_data,
+            .seed = rand(),
+        },
+        .tex = src,
+        .components = 3,
+        .component_mapping = {0, 1, 2},
+        .repr = &(struct pl_color_repr) {0},
+    };
+
+    REQUIRE(pl_shader_film_grain(sh, state, &params));
+}
+
+static void bench_av1_grain_lap(pl_shader sh, pl_shader_obj *state, pl_tex src)
+{
+    struct pl_film_grain_params params = {
+        .data = {
+            .type = PL_FILM_GRAIN_AV1,
+            .params.av1 = av1_grain_data,
+            .seed = rand(),
+        },
+        .tex = src,
+        .components = 3,
+        .component_mapping = {0, 1, 2},
+        .repr = &(struct pl_color_repr) {0},
+    };
+
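+    // Same grain parameters as bench_av1_grain above, but with overlap
+    // blending enabled (below) so the block overlap path is also measured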
+    params.data.params.av1.overlap = true;
+    REQUIRE(pl_shader_film_grain(sh, state, &params));
+}
+
+static void bench_h274_grain(pl_shader sh, pl_shader_obj *state, pl_tex src)
+{
+    struct pl_film_grain_params params = {
+        .data = {
+            .type = PL_FILM_GRAIN_H274,
+            .params.h274 = h274_grain_data,
+            .seed = rand(),
+        },
+        .tex = src,
+        .components = 3,
+        .component_mapping = {0, 1, 2},
+        .repr = &(struct pl_color_repr) {0},
+    };
+
+    REQUIRE(pl_shader_film_grain(sh, state, &params));
+}
+
+static void bench_reshape_poly(pl_shader sh, pl_shader_obj *state, pl_tex src)
+{
+    REQUIRE(pl_shader_sample_direct(sh, pl_sample_src( .tex = src )));
+    pl_shader_dovi_reshape(sh, &(struct pl_dovi_metadata) { .comp = {
+        {
+            .num_pivots = 8,
+            .pivots = {0.0, 0.00488758553, 0.0420332365, 0.177908108,
+                       0.428152502, 0.678396881, 0.92864126, 1.0},
+            .method = {0, 0, 0, 0, 0, 0, 0},
+            .poly_coeffs = {
+                {0.00290930271, 2.30019712, 50.1446037},
+                {0.00725257397, 1.88119054, -4.49443769},
+                {0.0150123835, 1.61106598, -1.64833081},
+                {0.0498571396, 1.2059114, -0.430627108},
+                {0.0878019333, 1.01845241, -0.19669354},
+                {0.120447636, 0.920134187, -0.122338772},
+                {2.12430835, -3.30913281, 2.10893941},
+            },
+        }, {
+            .num_pivots = 2,
+            .pivots = {0.0, 1.0},
+            .method = {0},
+            .poly_coeffs = {{-0.397901177, 1.85908031, 0}},
+        }, {
+            .num_pivots = 2,
+            .pivots = {0.0, 1.0},
+            .method = {0},
+            .poly_coeffs = {{-0.399355531, 1.85591626, 0}},
+        },
+    }});
+}
+
+static void bench_reshape_mmr(pl_shader sh, pl_shader_obj *state, pl_tex src)
+{
+    REQUIRE(pl_shader_sample_direct(sh, pl_sample_src( .tex = src )));
+    pl_shader_dovi_reshape(sh, &dovi_meta); // this includes MMR
+}
+
+static float data[WIDTH * HEIGHT * COMPS + 8192];
+
+static void bench_download(pl_gpu gpu, pl_tex tex)
+{
+    REQUIRE(pl_tex_download(gpu, pl_tex_transfer_params(
+        .tex = tex,
+        .ptr = (uint8_t *) PL_ALIGN((uintptr_t) data, 4096),
+    )));
+}
+
+static void bench_upload(pl_gpu gpu, pl_tex tex)
+{
+    REQUIRE(pl_tex_upload(gpu, pl_tex_transfer_params(
+        .tex = tex,
+        .ptr = (uint8_t *) PL_ALIGN((uintptr_t) data, 4096),
+    )));
+}
+
+static void dummy_cb(void *arg) {}
+
+static void bench_download_async(pl_gpu gpu, pl_tex tex)
+{
+    REQUIRE(pl_tex_download(gpu, pl_tex_transfer_params(
+        .tex = tex,
+        .ptr = (uint8_t *) PL_ALIGN((uintptr_t) data, 4096),
+        .callback = dummy_cb,
+    )));
+}
+
+static void bench_upload_async(pl_gpu gpu, pl_tex tex)
+{
+    REQUIRE(pl_tex_upload(gpu, pl_tex_transfer_params(
+        .tex = tex,
+        .ptr = (uint8_t *) PL_ALIGN((uintptr_t) data, 4096),
+        .callback = dummy_cb,
+    )));
+}
+
+int main()
+{
+    setbuf(stdout, NULL);
+    setbuf(stderr, NULL);
+
+    pl_log log = pl_log_create(PL_API_VER, pl_log_params(
+        .log_cb = isatty(fileno(stdout)) ?
pl_log_color : pl_log_simple, + .log_level = PL_LOG_WARN, + )); + + pl_vulkan vk = pl_vulkan_create(log, pl_vulkan_params( + .allow_software = true, + .async_transfer = ASYNC_TX, + .async_compute = ASYNC_COMP, + .queue_count = NUM_QUEUES, + )); + + if (!vk) + return SKIP; + +#define BENCH_SH(fn) &(struct bench) { .run_sh = fn } +#define BENCH_TEX(fn) &(struct bench) { .run_tex = fn } + + printf("= Running benchmarks =\n"); + benchmark(vk->gpu, "tex_download ptr", BENCH_TEX(bench_download)); + benchmark(vk->gpu, "tex_download ptr async", BENCH_TEX(bench_download_async)); + benchmark(vk->gpu, "tex_upload ptr", BENCH_TEX(bench_upload)); + benchmark(vk->gpu, "tex_upload ptr async", BENCH_TEX(bench_upload_async)); + benchmark(vk->gpu, "bilinear", BENCH_SH(bench_bilinear)); + benchmark(vk->gpu, "bicubic", BENCH_SH(bench_bicubic)); + benchmark(vk->gpu, "hermite", BENCH_SH(bench_hermite)); + benchmark(vk->gpu, "gaussian", BENCH_SH(bench_gaussian)); + benchmark(vk->gpu, "deband", BENCH_SH(bench_deband)); + benchmark(vk->gpu, "deband_heavy", BENCH_SH(bench_deband_heavy)); + + // Deinterlacing + benchmark(vk->gpu, "weave", BENCH_SH(bench_weave)); + benchmark(vk->gpu, "bob", BENCH_SH(bench_bob)); + benchmark(vk->gpu, "yadif", BENCH_SH(bench_yadif)); + + // Polar sampling + benchmark(vk->gpu, "polar", BENCH_SH(bench_polar)); + if (vk->gpu->glsl.compute) + benchmark(vk->gpu, "polar_nocompute", BENCH_SH(bench_polar_nocompute)); + + // Dithering algorithms + benchmark(vk->gpu, "dither_blue", BENCH_SH(bench_dither_blue)); + benchmark(vk->gpu, "dither_white", BENCH_SH(bench_dither_white)); + benchmark(vk->gpu, "dither_ordered_fixed", BENCH_SH(bench_dither_ordered_fix)); + + // HDR peak detection + if (vk->gpu->glsl.compute) { + benchmark(vk->gpu, "hdr_peakdetect", BENCH_SH(bench_hdr_peak)); + benchmark(vk->gpu, "hdr_peakdetect_hq", BENCH_SH(bench_hdr_peak_hq)); + } + + // Tone mapping + benchmark(vk->gpu, "hdr_lut", BENCH_SH(bench_hdr_lut)); + benchmark(vk->gpu, "hdr_clip", BENCH_SH(bench_hdr_clip)); + + // Misc stuff + benchmark(vk->gpu, "av1_grain", BENCH_SH(bench_av1_grain)); + benchmark(vk->gpu, "av1_grain_lap", BENCH_SH(bench_av1_grain_lap)); + benchmark(vk->gpu, "h274_grain", BENCH_SH(bench_h274_grain)); + benchmark(vk->gpu, "reshape_poly", BENCH_SH(bench_reshape_poly)); + benchmark(vk->gpu, "reshape_mmr", BENCH_SH(bench_reshape_mmr)); + + pl_vulkan_destroy(&vk); + pl_log_destroy(&log); + return 0; +} diff --git a/src/tests/cache.c b/src/tests/cache.c new file mode 100644 index 0000000..667435d --- /dev/null +++ b/src/tests/cache.c @@ -0,0 +1,215 @@ +#include "tests.h" + +#include <libplacebo/cache.h> + +// Returns "foo" for even keys, "bar" for odd +static pl_cache_obj lookup_foobar(void *priv, uint64_t key) +{ + return (pl_cache_obj) { + .key = 0xFFFF, // test key sanity + .data = (key & 1) ? "bar" : "foo", + .size = 3, + }; +} + +static void update_count(void *priv, pl_cache_obj obj) +{ + int *count = priv; + *count += obj.size ? 
1 : -1; +} + +enum { + KEY1 = 0x9c65575f419288f5, + KEY2 = 0x92da969be9b88086, + KEY3 = 0x7fcb62540b00bc8b, + KEY4 = 0x46c60ec11af9dde3, + KEY5 = 0xcb6760b98ece2477, + KEY6 = 0xf37dc72b7f9e5c88, + KEY7 = 0x30c18c962d82e5f5, +}; + +int main() +{ + pl_log log = pl_test_logger(); + pl_cache test = pl_cache_create(pl_cache_params( + .log = log, + .max_object_size = 16, + .max_total_size = 32, + )); + + pl_cache_obj obj1 = { .key = KEY1, .data = "abc", .size = 3 }; + pl_cache_obj obj2 = { .key = KEY2, .data = "de", .size = 2 }; + pl_cache_obj obj3 = { .key = KEY3, .data = "xyzw", .size = 4 }; + + REQUIRE(pl_cache_try_set(test, &obj1)); + REQUIRE(pl_cache_try_set(test, &obj2)); + REQUIRE(pl_cache_try_set(test, &obj3)); + REQUIRE_CMP(pl_cache_size(test), ==, 9, "zu"); + REQUIRE_CMP(pl_cache_objects(test), ==, 3, "d"); + REQUIRE(pl_cache_try_set(test, &obj2)); // delete KEY2 + REQUIRE_CMP(pl_cache_size(test), ==, 7, "zu"); + REQUIRE_CMP(pl_cache_objects(test), ==, 2, "d"); + + REQUIRE(pl_cache_get(test, &obj1)); + REQUIRE(!pl_cache_get(test, &obj2)); + REQUIRE(pl_cache_get(test, &obj3)); + REQUIRE_CMP(pl_cache_size(test), ==, 0, "zu"); + REQUIRE_CMP(pl_cache_objects(test), ==, 0, "d"); + REQUIRE_MEMEQ(obj1.data, "abc", 3); + REQUIRE_MEMEQ(obj3.data, "xyzw", 4); + + // Re-insert removed objects (in reversed order) + REQUIRE(pl_cache_try_set(test, &obj3)); + REQUIRE(pl_cache_try_set(test, &obj1)); + REQUIRE_CMP(pl_cache_size(test), ==, 7, "zu"); + REQUIRE_CMP(pl_cache_objects(test), ==, 2, "d"); + + uint8_t ref[72]; + memset(ref, 0xbe, sizeof(ref)); + uint8_t *refp = ref; + +#define PAD_ALIGN(x) PL_ALIGN2(x, sizeof(uint32_t)) +#define W(type, ...) \ + do { \ + size_t sz = sizeof((type){__VA_ARGS__}); \ + pl_assert(ref + sizeof(ref) - refp >= sz); \ + memcpy(refp, &(type){__VA_ARGS__}, sz); \ + refp += sz; \ + size_t pad_sz = PAD_ALIGN(sz) - sz; \ + pl_assert(ref + sizeof(ref) - refp >= pad_sz); \ + memcpy(refp, &(char[PAD_ALIGN(1)]){0}, pad_sz); \ + refp += pad_sz; \ + } while (0) + + W(char[], 'p', 'l', '_', 'c', 'a', 'c', 'h', 'e'); // cache magic + W(uint32_t, 1); // cache version + W(uint32_t, 2); // number of objects + + // object 3 + W(uint64_t, KEY3); // key + W(uint64_t, 4); // size +#ifdef PL_HAVE_XXHASH + W(uint64_t, 0xd43612ef3fbee8be); // hash +#else + W(uint64_t, 0xec18884e5e471117); // hash +#endif + W(char[], 'x', 'y', 'z', 'w'); // data + + // object 1 + W(uint64_t, KEY1); // key + W(uint64_t, 3); // size +#ifdef PL_HAVE_XXHASH + W(uint64_t, 0x78af5f94892f3950); // hash +#else + W(uint64_t, 0x3a204d408a2e2d77); // hash +#endif + W(char[], 'a', 'b', 'c'); // data + +#undef W +#undef PAD_ALIGN + + uint8_t data[100]; + pl_static_assert(sizeof(data) >= sizeof(ref)); + REQUIRE_CMP(pl_cache_save(test, data, sizeof(data)), ==, sizeof(ref), "zu"); + REQUIRE_MEMEQ(data, ref, sizeof(ref)); + + pl_cache test2 = pl_cache_create(pl_cache_params( .log = log )); + REQUIRE_CMP(pl_cache_load(test2, data, sizeof(data)), ==, 2, "d"); + REQUIRE_CMP(pl_cache_size(test2), ==, 7, "zu"); + REQUIRE_CMP(pl_cache_save(test2, NULL, 0), ==, sizeof(ref), "zu"); + REQUIRE_CMP(pl_cache_save(test2, data, sizeof(data)), ==, sizeof(ref), "zu"); + REQUIRE_MEMEQ(data, ref, sizeof(ref)); + + // Test loading invalid data + REQUIRE_CMP(pl_cache_load(test2, ref, 0), <, 0, "d"); // empty file + REQUIRE_CMP(pl_cache_load(test2, ref, 5), <, 0, "d"); // truncated header + REQUIRE_CMP(pl_cache_load(test2, ref, 64), ==, 1, "d"); // truncated object data + data[sizeof(ref) - 2] = 'X'; // corrupt data + 
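// Only the first serialized object (KEY3) still loads; the corrupted entry fails its hash check +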
REQUIRE_CMP(pl_cache_load(test2, data, sizeof(ref)), ==, 1, "d"); // bad checksum + pl_cache_destroy(&test2); + + // Inserting too large object should fail + uint8_t zero[32] = {0}; + pl_cache_obj obj4 = { .key = KEY4, .data = zero, .size = 32 }; + REQUIRE(!pl_cache_try_set(test, &obj4)); + REQUIRE(!pl_cache_get(test, &obj4)); + REQUIRE_CMP(pl_cache_size(test), ==, 7, "zu"); + REQUIRE_CMP(pl_cache_objects(test), ==, 2, "d"); + + // Inserting 16-byte object should succeed, and not purge old entries + obj4 = (pl_cache_obj) { .key = KEY4, .data = zero, .size = 16 }; + REQUIRE(pl_cache_try_set(test, &obj4)); + REQUIRE_CMP(pl_cache_size(test), ==, 23, "zu"); + REQUIRE_CMP(pl_cache_objects(test), ==, 3, "d"); + REQUIRE(pl_cache_get(test, &obj1)); + REQUIRE(pl_cache_get(test, &obj3)); + REQUIRE(pl_cache_get(test, &obj4)); + pl_cache_set(test, &obj1); + pl_cache_set(test, &obj3); + pl_cache_set(test, &obj4); + REQUIRE_CMP(pl_cache_size(test), ==, 23, "zu"); + REQUIRE_CMP(pl_cache_objects(test), ==, 3, "d"); + + // Inserting another 10-byte object should purge entry KEY1 + pl_cache_obj obj5 = { .key = KEY5, .data = zero, .size = 10 }; + REQUIRE(pl_cache_try_set(test, &obj5)); + REQUIRE_CMP(pl_cache_size(test), ==, 30, "zu"); + REQUIRE_CMP(pl_cache_objects(test), ==, 3, "d"); + REQUIRE(!pl_cache_get(test, &obj1)); + REQUIRE(pl_cache_get(test, &obj3)); + REQUIRE(pl_cache_get(test, &obj4)); + REQUIRE(pl_cache_get(test, &obj5)); + pl_cache_set(test, &obj3); + pl_cache_set(test, &obj4); + pl_cache_set(test, &obj5); + REQUIRE_CMP(pl_cache_size(test), ==, 30, "zu"); + REQUIRE_CMP(pl_cache_objects(test), ==, 3, "d"); + + // Inserting final 6-byte object should purge entry KEY3 + pl_cache_obj obj6 = { .key = KEY6, .data = zero, .size = 6 }; + REQUIRE(pl_cache_try_set(test, &obj6)); + REQUIRE_CMP(pl_cache_size(test), ==, 32, "zu"); + REQUIRE_CMP(pl_cache_objects(test), ==, 3, "d"); + REQUIRE(!pl_cache_get(test, &obj3)); + REQUIRE(pl_cache_get(test, &obj4)); + REQUIRE(pl_cache_get(test, &obj5)); + REQUIRE(pl_cache_get(test, &obj6)); + REQUIRE_CMP(pl_cache_size(test), ==, 0, "zu"); + REQUIRE_CMP(pl_cache_objects(test), ==, 0, "d"); + pl_cache_obj_free(&obj4); + pl_cache_obj_free(&obj5); + pl_cache_obj_free(&obj6); + + // Test callback API + int num_objects = 0; + test2 = pl_cache_create(pl_cache_params( + .get = lookup_foobar, + .set = update_count, + .priv = &num_objects, + )); + + REQUIRE(pl_cache_get(test2, &obj1)); + REQUIRE_CMP(obj1.key, ==, KEY1, PRIu64); + REQUIRE_CMP(obj1.size, ==, 3, "zu"); + REQUIRE_MEMEQ(obj1.data, "bar", 3); + REQUIRE(pl_cache_get(test2, &obj2)); + REQUIRE_CMP(obj2.key, ==, KEY2, PRIu64); + REQUIRE_CMP(obj2.size, ==, 3, "zu"); + REQUIRE_MEMEQ(obj2.data, "foo", 3); + REQUIRE_CMP(pl_cache_objects(test2), ==, 0, "d"); + REQUIRE_CMP(num_objects, ==, 0, "d"); + REQUIRE(pl_cache_try_set(test2, &obj1)); + REQUIRE(pl_cache_try_set(test2, &obj2)); + REQUIRE(pl_cache_try_set(test2, &(pl_cache_obj) { .key = KEY7, .data = "abcde", .size = 5 })); + REQUIRE_CMP(pl_cache_objects(test2), ==, 3, "d"); + REQUIRE_CMP(num_objects, ==, 3, "d"); + REQUIRE(pl_cache_try_set(test2, &obj1)); + REQUIRE(pl_cache_try_set(test2, &obj2)); + REQUIRE_CMP(pl_cache_objects(test2), ==, 1, "d"); + REQUIRE_CMP(num_objects, ==, 1, "d"); + pl_cache_destroy(&test2); + + pl_cache_destroy(&test); + pl_log_destroy(&log); + return 0; +} diff --git a/src/tests/colorspace.c b/src/tests/colorspace.c new file mode 100644 index 0000000..4b0662b --- /dev/null +++ b/src/tests/colorspace.c @@ -0,0 +1,488 @@ +#include "tests.h" + +int 
main() +{ + for (enum pl_color_system sys = 0; sys < PL_COLOR_SYSTEM_COUNT; sys++) { + bool ycbcr = sys >= PL_COLOR_SYSTEM_BT_601 && sys <= PL_COLOR_SYSTEM_YCGCO; + REQUIRE_CMP(ycbcr, ==, pl_color_system_is_ycbcr_like(sys), "d"); + } + + for (enum pl_color_transfer trc = 0; trc < PL_COLOR_TRC_COUNT; trc++) { + bool hdr = trc >= PL_COLOR_TRC_PQ && trc <= PL_COLOR_TRC_S_LOG2; + REQUIRE_CMP(hdr, ==, pl_color_transfer_is_hdr(trc), "d"); + REQUIRE_CMP(pl_color_transfer_nominal_peak(trc), >=, 1.0, "f"); + } + + float pq_peak = pl_color_transfer_nominal_peak(PL_COLOR_TRC_PQ); + REQUIRE_FEQ(PL_COLOR_SDR_WHITE * pq_peak, 10000, 1e-7); + + struct pl_color_repr tv_repr = { + .sys = PL_COLOR_SYSTEM_BT_709, + .levels = PL_COLOR_LEVELS_LIMITED, + }; + + struct pl_color_repr pc_repr = { + .sys = PL_COLOR_SYSTEM_RGB, + .levels = PL_COLOR_LEVELS_FULL, + }; + + // Ensure this is a no-op for bits == bits + for (int bits = 1; bits <= 16; bits++) { + tv_repr.bits.color_depth = tv_repr.bits.sample_depth = bits; + pc_repr.bits.color_depth = pc_repr.bits.sample_depth = bits; + REQUIRE_FEQ(pl_color_repr_normalize(&tv_repr), 1.0, 1e-7); + REQUIRE_FEQ(pl_color_repr_normalize(&pc_repr), 1.0, 1e-7); + } + + tv_repr.bits.color_depth = 8; + tv_repr.bits.sample_depth = 10; + float tv8to10 = pl_color_repr_normalize(&tv_repr); + + tv_repr.bits.color_depth = 8; + tv_repr.bits.sample_depth = 12; + float tv8to12 = pl_color_repr_normalize(&tv_repr); + + // Simulate the effect of GPU texture sampling on UNORM texture + REQUIRE_FEQ(tv8to10 * 16 /1023., 64/1023., 1e-7); // black + REQUIRE_FEQ(tv8to10 * 235/1023., 940/1023., 1e-7); // nominal white + REQUIRE_FEQ(tv8to10 * 128/1023., 512/1023., 1e-7); // achromatic + REQUIRE_FEQ(tv8to10 * 240/1023., 960/1023., 1e-7); // nominal chroma peak + + REQUIRE_FEQ(tv8to12 * 16 /4095., 256 /4095., 1e-7); // black + REQUIRE_FEQ(tv8to12 * 235/4095., 3760/4095., 1e-7); // nominal white + REQUIRE_FEQ(tv8to12 * 128/4095., 2048/4095., 1e-7); // achromatic + REQUIRE_FEQ(tv8to12 * 240/4095., 3840/4095., 1e-7); // nominal chroma peak + + // Ensure lavc's xyz12 is handled correctly + struct pl_color_repr xyz12 = { + .sys = PL_COLOR_SYSTEM_XYZ, + .levels = PL_COLOR_LEVELS_UNKNOWN, + .bits = { + .sample_depth = 16, + .color_depth = 12, + .bit_shift = 4, + }, + }; + + float xyz = pl_color_repr_normalize(&xyz12); + REQUIRE_FEQ(xyz * (4095 << 4), 65535, 1e-7); + + // Assume we uploaded a 10-bit source directly (unshifted) as a 16-bit + // texture. This texture multiplication factor should make it behave as if + // it was uploaded as a 10-bit texture instead. 
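+    // (expected factor: (2^16 - 1) / (2^10 - 1) = 65535/1023 ≈ 64.06, as checked below)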
+ pc_repr.bits.color_depth = 10; + pc_repr.bits.sample_depth = 16; + float pc10to16 = pl_color_repr_normalize(&pc_repr); + REQUIRE_FEQ(pc10to16 * 1000/65535., 1000/1023., 1e-7); + + const struct pl_raw_primaries *bt709, *bt2020, *dcip3; + bt709 = pl_raw_primaries_get(PL_COLOR_PRIM_BT_709); + bt2020 = pl_raw_primaries_get(PL_COLOR_PRIM_BT_2020); + dcip3 = pl_raw_primaries_get(PL_COLOR_PRIM_DCI_P3); + REQUIRE(pl_primaries_superset(bt2020, bt709)); + REQUIRE(!pl_primaries_superset(bt2020, dcip3)); // small region doesn't overlap + REQUIRE(pl_primaries_superset(dcip3, bt709)); + REQUIRE(!pl_primaries_superset(bt709, bt2020)); + REQUIRE(pl_primaries_compatible(bt2020, bt2020)); + REQUIRE(pl_primaries_compatible(bt2020, bt709)); + REQUIRE(pl_primaries_compatible(bt709, bt2020)); + REQUIRE(pl_primaries_compatible(bt2020, dcip3)); + REQUIRE(pl_primaries_compatible(bt709, dcip3)); + + struct pl_raw_primaries bt709_2020 = pl_primaries_clip(bt709, bt2020); + struct pl_raw_primaries bt2020_709 = pl_primaries_clip(bt2020, bt709); + REQUIRE(pl_raw_primaries_similar(&bt709_2020, bt709)); + REQUIRE(pl_raw_primaries_similar(&bt2020_709, bt709)); + + struct pl_raw_primaries dcip3_bt2020 = pl_primaries_clip(dcip3, bt2020); + struct pl_raw_primaries dcip3_bt709 = pl_primaries_clip(dcip3, bt709); + REQUIRE(pl_primaries_superset(dcip3, &dcip3_bt2020)); + REQUIRE(pl_primaries_superset(dcip3, &dcip3_bt709)); + REQUIRE(pl_primaries_superset(bt2020, &dcip3_bt2020)); + REQUIRE(pl_primaries_superset(bt709, &dcip3_bt709)); + + pl_matrix3x3 rgb2xyz, rgb2xyz_; + rgb2xyz = rgb2xyz_ = pl_get_rgb2xyz_matrix(bt709); + pl_matrix3x3_invert(&rgb2xyz_); + pl_matrix3x3_invert(&rgb2xyz_); + + // Make sure the double-inversion round trips + for (int y = 0; y < 3; y++) { + for (int x = 0; x < 3; x++) + REQUIRE_FEQ(rgb2xyz.m[y][x], rgb2xyz_.m[y][x], 1e-6); + } + + // Make sure mapping the spectral RGB colors (i.e. 
the matrix rows) matches + // our original primaries + float Y = rgb2xyz.m[1][0]; + REQUIRE_FEQ(rgb2xyz.m[0][0], pl_cie_X(bt709->red) * Y, 1e-7); + REQUIRE_FEQ(rgb2xyz.m[2][0], pl_cie_Z(bt709->red) * Y, 1e-7); + Y = rgb2xyz.m[1][1]; + REQUIRE_FEQ(rgb2xyz.m[0][1], pl_cie_X(bt709->green) * Y, 1e-7); + REQUIRE_FEQ(rgb2xyz.m[2][1], pl_cie_Z(bt709->green) * Y, 1e-7); + Y = rgb2xyz.m[1][2]; + REQUIRE_FEQ(rgb2xyz.m[0][2], pl_cie_X(bt709->blue) * Y, 1e-7); + REQUIRE_FEQ(rgb2xyz.m[2][2], pl_cie_Z(bt709->blue) * Y, 1e-7); + + // Make sure the gamut mapping round-trips + pl_matrix3x3 bt709_bt2020, bt2020_bt709; + bt709_bt2020 = pl_get_color_mapping_matrix(bt709, bt2020, PL_INTENT_RELATIVE_COLORIMETRIC); + bt2020_bt709 = pl_get_color_mapping_matrix(bt2020, bt709, PL_INTENT_RELATIVE_COLORIMETRIC); + for (int n = 0; n < 10; n++) { + float vec[3] = { RANDOM, RANDOM, RANDOM }; + float dst[3] = { vec[0], vec[1], vec[2] }; + pl_matrix3x3_apply(&bt709_bt2020, dst); + pl_matrix3x3_apply(&bt2020_bt709, dst); + for (int i = 0; i < 3; i++) + REQUIRE_FEQ(dst[i], vec[i], 1e-6); + } + + // Ensure the decoding matrix round-trips to white/black + for (enum pl_color_system sys = 0; sys < PL_COLOR_SYSTEM_COUNT; sys++) { + if (!pl_color_system_is_linear(sys)) + continue; + + printf("testing color system %u\n", (unsigned) sys); + struct pl_color_repr repr = { + .levels = PL_COLOR_LEVELS_LIMITED, + .sys = sys, + .bits = { + // synthetic test + .color_depth = 8, + .sample_depth = 10, + }, + }; + + float scale = pl_color_repr_normalize(&repr); + pl_transform3x3 yuv2rgb = pl_color_repr_decode(&repr, NULL); + pl_matrix3x3_scale(&yuv2rgb.mat, scale); + + static const float white_ycbcr[3] = { 235/1023., 128/1023., 128/1023. }; + static const float black_ycbcr[3] = { 16/1023., 128/1023., 128/1023. }; + static const float white_other[3] = { 235/1023., 235/1023., 235/1023. }; + static const float black_other[3] = { 16/1023., 16/1023., 16/1023. }; + + float white[3], black[3]; + for (int i = 0; i < 3; i++) { + if (pl_color_system_is_ycbcr_like(sys)) { + white[i] = white_ycbcr[i]; + black[i] = black_ycbcr[i]; + } else { + white[i] = white_other[i]; + black[i] = black_other[i]; + } + } + + pl_transform3x3_apply(&yuv2rgb, white); + REQUIRE_FEQ(white[0], 1.0, 1e-6); + REQUIRE_FEQ(white[1], 1.0, 1e-6); + REQUIRE_FEQ(white[2], 1.0, 1e-6); + + pl_transform3x3_apply(&yuv2rgb, black); + REQUIRE_FEQ(black[0], 0.0, 1e-6); + REQUIRE_FEQ(black[1], 0.0, 1e-6); + REQUIRE_FEQ(black[2], 0.0, 1e-6); + } + + // Make sure chromatic adaptation works + struct pl_raw_primaries bt709_d50; + bt709_d50 = *pl_raw_primaries_get(PL_COLOR_PRIM_BT_709); + bt709_d50.white = (struct pl_cie_xy) { 0.34567, 0.35850 }; + + pl_matrix3x3 d50_d65; + d50_d65 = pl_get_color_mapping_matrix(&bt709_d50, bt709, PL_INTENT_RELATIVE_COLORIMETRIC); + + float white[3] = { 1.0, 1.0, 1.0 }; + pl_matrix3x3_apply(&d50_d65, white); + REQUIRE_FEQ(white[0], 1.0, 1e-6); + REQUIRE_FEQ(white[1], 1.0, 1e-6); + REQUIRE_FEQ(white[2], 1.0, 1e-6); + + // Simulate a typical 10-bit YCbCr -> 16 bit texture conversion + tv_repr.bits.color_depth = 10; + tv_repr.bits.sample_depth = 16; + pl_transform3x3 yuv2rgb; + yuv2rgb = pl_color_repr_decode(&tv_repr, NULL); + float test[3] = { 575/65535., 336/65535., 640/65535. 
}; + pl_transform3x3_apply(&yuv2rgb, test); + REQUIRE_FEQ(test[0], 0.808305, 1e-6); + REQUIRE_FEQ(test[1], 0.553254, 1e-6); + REQUIRE_FEQ(test[2], 0.218841, 1e-6); + + // DVD + REQUIRE_CMP(pl_color_system_guess_ycbcr(720, 480), ==, PL_COLOR_SYSTEM_BT_601, "u"); + REQUIRE_CMP(pl_color_system_guess_ycbcr(720, 576), ==, PL_COLOR_SYSTEM_BT_601, "u"); + REQUIRE_CMP(pl_color_primaries_guess(720, 576), ==, PL_COLOR_PRIM_BT_601_625, "u"); + REQUIRE_CMP(pl_color_primaries_guess(720, 480), ==, PL_COLOR_PRIM_BT_601_525, "u"); + // PAL 16:9 + REQUIRE_CMP(pl_color_system_guess_ycbcr(1024, 576), ==, PL_COLOR_SYSTEM_BT_601, "u"); + REQUIRE_CMP(pl_color_primaries_guess(1024, 576), ==, PL_COLOR_PRIM_BT_601_625, "u"); + // HD + REQUIRE_CMP(pl_color_system_guess_ycbcr(1280, 720), ==, PL_COLOR_SYSTEM_BT_709, "u"); + REQUIRE_CMP(pl_color_system_guess_ycbcr(1920, 1080), ==, PL_COLOR_SYSTEM_BT_709, "u"); + REQUIRE_CMP(pl_color_primaries_guess(1280, 720), ==, PL_COLOR_PRIM_BT_709, "u"); + REQUIRE_CMP(pl_color_primaries_guess(1920, 1080), ==, PL_COLOR_PRIM_BT_709, "u"); + + // Odd/weird videos + REQUIRE_CMP(pl_color_primaries_guess(2000, 576), ==, PL_COLOR_PRIM_BT_709, "u"); + REQUIRE_CMP(pl_color_primaries_guess(200, 200), ==, PL_COLOR_PRIM_BT_709, "u"); + + REQUIRE(pl_color_repr_equal(&pl_color_repr_sdtv, &pl_color_repr_sdtv)); + REQUIRE(!pl_color_repr_equal(&pl_color_repr_sdtv, &pl_color_repr_hdtv)); + + struct pl_color_repr repr = pl_color_repr_unknown; + pl_color_repr_merge(&repr, &pl_color_repr_uhdtv); + REQUIRE(pl_color_repr_equal(&repr, &pl_color_repr_uhdtv)); + + REQUIRE(!pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_UNKNOWN)); + REQUIRE(!pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_BT_601_525)); + REQUIRE(!pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_BT_601_625)); + REQUIRE(!pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_BT_709)); + REQUIRE(!pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_BT_470M)); + REQUIRE(pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_BT_2020)); + REQUIRE(pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_APPLE)); + REQUIRE(pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_ADOBE)); + REQUIRE(pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_PRO_PHOTO)); + REQUIRE(pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_CIE_1931)); + REQUIRE(pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_DCI_P3)); + REQUIRE(pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_DISPLAY_P3)); + REQUIRE(pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_V_GAMUT)); + REQUIRE(pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_S_GAMUT)); + + struct pl_color_space space = pl_color_space_unknown; + pl_color_space_merge(&space, &pl_color_space_bt709); + REQUIRE(pl_color_space_equal(&space, &pl_color_space_bt709)); + + // Infer some color spaces + struct pl_color_space hlg = { + .primaries = PL_COLOR_PRIM_BT_2020, + .transfer = PL_COLOR_TRC_HLG, + }; + + pl_color_space_infer(&hlg); + REQUIRE_CMP(hlg.hdr.max_luma, ==, PL_COLOR_HLG_PEAK, "f"); + + struct pl_color_space unknown = {0}; + struct pl_color_space display = { + .primaries = PL_COLOR_PRIM_BT_709, + .transfer = PL_COLOR_TRC_BT_1886, + }; + + pl_color_space_infer(&unknown); + pl_color_space_infer(&display); + REQUIRE(pl_color_space_equal(&unknown, &display)); + + float x, y; + pl_chroma_location_offset(PL_CHROMA_LEFT, &x, &y); + REQUIRE_CMP(x, ==, -0.5f, "f"); + REQUIRE_CMP(y, ==, 0.0f, "f"); + pl_chroma_location_offset(PL_CHROMA_TOP_LEFT, &x, &y); + REQUIRE_CMP(x, ==, -0.5f, "f"); + REQUIRE_CMP(y, ==, -0.5f, "f"); + pl_chroma_location_offset(PL_CHROMA_CENTER, &x, &y); + REQUIRE_CMP(x, 
==, 0.0f, "f"); + REQUIRE_CMP(y, ==, 0.0f, "f"); + pl_chroma_location_offset(PL_CHROMA_BOTTOM_CENTER, &x, &y); + REQUIRE_CMP(x, ==, 0.0f, "f"); + REQUIRE_CMP(y, ==, 0.5f, "f"); + + REQUIRE_CMP(pl_raw_primaries_get(PL_COLOR_PRIM_UNKNOWN), ==, + pl_raw_primaries_get(PL_COLOR_PRIM_BT_709), "p"); + + // Color blindness tests + float red[3] = { 1.0, 0.0, 0.0 }; + float green[3] = { 0.0, 1.0, 0.0 }; + float blue[3] = { 0.0, 0.0, 1.0 }; + +#define TEST_CONE(model, color) \ + do { \ + float tmp[3] = { (color)[0], (color)[1], (color)[2] }; \ + pl_matrix3x3 mat = pl_get_cone_matrix(&(model), bt709); \ + pl_matrix3x3_apply(&mat, tmp); \ + printf("%s + %s = %f %f %f\n", #model, #color, tmp[0], tmp[1], tmp[2]); \ + for (int i = 0; i < 3; i++) \ + REQUIRE_FEQ((color)[i], tmp[i], 1e-5f); \ + } while(0) + + struct pl_cone_params red_only = { .cones = PL_CONE_MS }; + struct pl_cone_params green_only = { .cones = PL_CONE_LS }; + struct pl_cone_params blue_only = pl_vision_monochromacy; + + // These models should all round-trip white + TEST_CONE(pl_vision_normal, white); + TEST_CONE(pl_vision_protanopia, white); + TEST_CONE(pl_vision_protanomaly, white); + TEST_CONE(pl_vision_deuteranomaly, white); + TEST_CONE(pl_vision_tritanomaly, white); + TEST_CONE(pl_vision_achromatopsia, white); + TEST_CONE(red_only, white); + TEST_CONE(green_only, white); + TEST_CONE(blue_only, white); + + // These models should round-trip blue + TEST_CONE(pl_vision_normal, blue); + TEST_CONE(pl_vision_protanomaly, blue); + TEST_CONE(pl_vision_deuteranomaly, blue); + + // These models should round-trip red + TEST_CONE(pl_vision_normal, red); + TEST_CONE(pl_vision_tritanomaly, red); + TEST_CONE(pl_vision_tritanopia, red); + + // These models should round-trip green + TEST_CONE(pl_vision_normal, green); + + // Color adaptation tests + struct pl_cie_xy d65 = pl_white_from_temp(6504); + REQUIRE_FEQ(d65.x, 0.31271, 1e-3); + REQUIRE_FEQ(d65.y, 0.32902, 1e-3); + struct pl_cie_xy d55 = pl_white_from_temp(5503); + REQUIRE_FEQ(d55.x, 0.33242, 1e-3); + REQUIRE_FEQ(d55.y, 0.34743, 1e-3); + + // Make sure we infer the correct set of metadata parameters +#define TEST_METADATA(CSP, TYPE, MIN, MAX, AVG) \ + do { \ + float _min, _max, _avg; \ + pl_color_space_nominal_luma_ex(pl_nominal_luma_params( \ + .color = &(CSP), \ + .metadata = TYPE, \ + .scaling = PL_HDR_PQ, \ + .out_min = &_min, \ + .out_max = &_max, \ + .out_avg = &_avg, \ + )); \ + const float _min_ref = pl_hdr_rescale(PL_HDR_NITS, PL_HDR_PQ, MIN); \ + const float _max_ref = pl_hdr_rescale(PL_HDR_NITS, PL_HDR_PQ, MAX); \ + const float _avg_ref = pl_hdr_rescale(PL_HDR_NITS, PL_HDR_PQ, AVG); \ + REQUIRE_FEQ(_min, _min_ref, 1e-5); \ + REQUIRE_FEQ(_max, _max_ref, 1e-5); \ + REQUIRE_FEQ(_avg, _avg_ref, 1e-5); \ + } while (0) + + const struct pl_color_space hdr10plus = { + .primaries = PL_COLOR_PRIM_BT_2020, + .transfer = PL_COLOR_TRC_PQ, + .hdr = { + .min_luma = 0.005, + .max_luma = 4000, + .scene_max = {596.69, 1200, 500}, + .scene_avg = 300, + }, + }; + + REQUIRE(pl_hdr_metadata_contains(&hdr10plus.hdr, PL_HDR_METADATA_ANY)); + REQUIRE(pl_hdr_metadata_contains(&hdr10plus.hdr, PL_HDR_METADATA_NONE)); + REQUIRE(pl_hdr_metadata_contains(&hdr10plus.hdr, PL_HDR_METADATA_HDR10)); + REQUIRE(pl_hdr_metadata_contains(&hdr10plus.hdr, PL_HDR_METADATA_HDR10PLUS)); + REQUIRE(!pl_hdr_metadata_contains(&hdr10plus.hdr, PL_HDR_METADATA_CIE_Y)); + + TEST_METADATA(hdr10plus, PL_HDR_METADATA_NONE, PL_COLOR_HDR_BLACK, 10000, 0); + TEST_METADATA(hdr10plus, PL_HDR_METADATA_CIE_Y, PL_COLOR_HDR_BLACK, 4000, 0); + 
TEST_METADATA(hdr10plus, PL_HDR_METADATA_HDR10, PL_COLOR_HDR_BLACK, 4000, 0); + TEST_METADATA(hdr10plus, PL_HDR_METADATA_HDR10PLUS, PL_COLOR_HDR_BLACK, 1000, 250); + TEST_METADATA(hdr10plus, PL_HDR_METADATA_ANY, PL_COLOR_HDR_BLACK, 1000, 250); + + const struct pl_color_space dovi = { + .primaries = PL_COLOR_PRIM_BT_2020, + .transfer = PL_COLOR_TRC_PQ, + .hdr = { + .min_luma = 0.005, + .max_luma = 4000, + .max_pq_y = pl_hdr_rescale(PL_HDR_NITS, PL_HDR_PQ, 1000), + .avg_pq_y = pl_hdr_rescale(PL_HDR_NITS, PL_HDR_PQ, 250), + }, + }; + + REQUIRE(pl_hdr_metadata_contains(&dovi.hdr, PL_HDR_METADATA_ANY)); + REQUIRE(pl_hdr_metadata_contains(&dovi.hdr, PL_HDR_METADATA_NONE)); + REQUIRE(pl_hdr_metadata_contains(&dovi.hdr, PL_HDR_METADATA_HDR10)); + REQUIRE(pl_hdr_metadata_contains(&dovi.hdr, PL_HDR_METADATA_CIE_Y)); + REQUIRE(!pl_hdr_metadata_contains(&dovi.hdr, PL_HDR_METADATA_HDR10PLUS)); + + TEST_METADATA(dovi, PL_HDR_METADATA_NONE, PL_COLOR_HDR_BLACK, 10000, 0); + TEST_METADATA(dovi, PL_HDR_METADATA_HDR10, PL_COLOR_HDR_BLACK, 4000, 0); + TEST_METADATA(dovi, PL_HDR_METADATA_HDR10PLUS, PL_COLOR_HDR_BLACK, 4000, 0); + TEST_METADATA(dovi, PL_HDR_METADATA_CIE_Y, PL_COLOR_HDR_BLACK, 1000, 250); + TEST_METADATA(dovi, PL_HDR_METADATA_ANY, PL_COLOR_HDR_BLACK, 1000, 250); + + const struct pl_color_space hlg4000 = { + .primaries = PL_COLOR_PRIM_BT_2020, + .transfer = PL_COLOR_TRC_HLG, + .hdr.max_luma = 4000, + .hdr.min_luma = 0.005, + }; + + TEST_METADATA(hlg4000, PL_HDR_METADATA_NONE, PL_COLOR_HDR_BLACK, PL_COLOR_HLG_PEAK, 0); + TEST_METADATA(hlg4000, PL_HDR_METADATA_HDR10, 0.005, 4000, 0); + TEST_METADATA(hlg4000, PL_HDR_METADATA_ANY, 0.005, 4000, 0); + + const struct pl_color_space untagged = { + .primaries = PL_COLOR_PRIM_BT_709, + .transfer = PL_COLOR_TRC_BT_1886, + }; + + REQUIRE(pl_hdr_metadata_contains(&untagged.hdr, PL_HDR_METADATA_NONE)); + REQUIRE(!pl_hdr_metadata_contains(&untagged.hdr, PL_HDR_METADATA_ANY)); + REQUIRE(!pl_hdr_metadata_contains(&untagged.hdr, PL_HDR_METADATA_HDR10)); + REQUIRE(!pl_hdr_metadata_contains(&untagged.hdr, PL_HDR_METADATA_CIE_Y)); + REQUIRE(!pl_hdr_metadata_contains(&untagged.hdr, PL_HDR_METADATA_HDR10PLUS)); + + const float sdr_black = PL_COLOR_SDR_WHITE / PL_COLOR_SDR_CONTRAST; + TEST_METADATA(untagged, PL_HDR_METADATA_NONE, sdr_black, PL_COLOR_SDR_WHITE, 0); + TEST_METADATA(untagged, PL_HDR_METADATA_ANY, sdr_black, PL_COLOR_SDR_WHITE, 0); + + const struct pl_color_space sdr50 = { + .primaries = PL_COLOR_PRIM_BT_709, + .transfer = PL_COLOR_TRC_BT_1886, + .hdr.max_luma = 50, + }; + + REQUIRE(pl_hdr_metadata_contains(&sdr50.hdr, PL_HDR_METADATA_NONE)); + REQUIRE(pl_hdr_metadata_contains(&sdr50.hdr, PL_HDR_METADATA_ANY)); + REQUIRE(pl_hdr_metadata_contains(&sdr50.hdr, PL_HDR_METADATA_HDR10)); + REQUIRE(!pl_hdr_metadata_contains(&sdr50.hdr, PL_HDR_METADATA_CIE_Y)); + REQUIRE(!pl_hdr_metadata_contains(&sdr50.hdr, PL_HDR_METADATA_HDR10PLUS)); + + TEST_METADATA(sdr50, PL_HDR_METADATA_NONE, sdr_black, PL_COLOR_SDR_WHITE, 0); + TEST_METADATA(sdr50, PL_HDR_METADATA_HDR10, 50 / PL_COLOR_SDR_CONTRAST, 50, 0); + TEST_METADATA(sdr50, PL_HDR_METADATA_ANY, 50 / PL_COLOR_SDR_CONTRAST, 50, 0); + + const struct pl_color_space sdr10k = { + .primaries = PL_COLOR_PRIM_BT_709, + .transfer = PL_COLOR_TRC_BT_1886, + .hdr.min_luma = PL_COLOR_SDR_WHITE / 10000, + }; + + REQUIRE(pl_hdr_metadata_contains(&sdr10k.hdr, PL_HDR_METADATA_NONE)); + REQUIRE(!pl_hdr_metadata_contains(&sdr10k.hdr, PL_HDR_METADATA_ANY)); + REQUIRE(!pl_hdr_metadata_contains(&sdr10k.hdr, PL_HDR_METADATA_HDR10)); + 
TEST_METADATA(sdr10k, PL_HDR_METADATA_NONE, sdr_black, PL_COLOR_SDR_WHITE, 0); + TEST_METADATA(sdr10k, PL_HDR_METADATA_HDR10, PL_COLOR_SDR_WHITE / 10000, PL_COLOR_SDR_WHITE, 0); + TEST_METADATA(sdr10k, PL_HDR_METADATA_ANY, PL_COLOR_SDR_WHITE / 10000, PL_COLOR_SDR_WHITE, 0); + + const struct pl_color_space bogus_vals = { + .primaries = PL_COLOR_PRIM_BT_2020, + .transfer = PL_COLOR_TRC_HLG, + .hdr.min_luma = 1e-9, + .hdr.max_luma = 1000000, + }; + + const struct pl_color_space bogus_flip = { + .primaries = PL_COLOR_PRIM_BT_2020, + .transfer = PL_COLOR_TRC_PQ, + .hdr.min_luma = 4000, + .hdr.max_luma = 0.05, + }; + + const struct pl_color_space bogus_sign = { + .primaries = PL_COLOR_PRIM_BT_2020, + .transfer = PL_COLOR_TRC_HLG, + .hdr.min_luma = -0.5, + .hdr.max_luma = -4000, + }; + + TEST_METADATA(bogus_vals, PL_HDR_METADATA_HDR10, PL_COLOR_HDR_BLACK, 10000, 0); + TEST_METADATA(bogus_flip, PL_HDR_METADATA_HDR10, PL_COLOR_HDR_BLACK, 10000, 0); + TEST_METADATA(bogus_sign, PL_HDR_METADATA_HDR10, PL_COLOR_HDR_BLACK, PL_COLOR_HLG_PEAK, 0); +} diff --git a/src/tests/common.c b/src/tests/common.c new file mode 100644 index 0000000..849971e --- /dev/null +++ b/src/tests/common.c @@ -0,0 +1,136 @@ +#include "tests.h" + +static int irand() +{ + return rand() - RAND_MAX / 2; +} + +int main() +{ + pl_log log = pl_test_logger(); + pl_log_update(log, NULL); + pl_log_destroy(&log); + + // Test some misc helper functions + pl_rect2d rc2 = { + irand(), irand(), + irand(), irand(), + }; + + pl_rect3d rc3 = { + irand(), irand(), irand(), + irand(), irand(), irand(), + }; + + pl_rect2d_normalize(&rc2); + REQUIRE_CMP(rc2.x1, >=, rc2.x0, "d"); + REQUIRE_CMP(rc2.y1, >=, rc2.y0, "d"); + + pl_rect3d_normalize(&rc3); + REQUIRE_CMP(rc3.x1, >=, rc3.x0, "d"); + REQUIRE_CMP(rc3.y1, >=, rc3.y0, "d"); + REQUIRE_CMP(rc3.z1, >=, rc3.z0, "d"); + + pl_rect2df rc2f = { + RANDOM, RANDOM, + RANDOM, RANDOM, + }; + + pl_rect3df rc3f = { + RANDOM, RANDOM, RANDOM, + RANDOM, RANDOM, RANDOM, + }; + + pl_rect2df_normalize(&rc2f); + REQUIRE_CMP(rc2f.x1, >=, rc2f.x0, "f"); + REQUIRE_CMP(rc2f.y1, >=, rc2f.y0, "f"); + + pl_rect3df_normalize(&rc3f); + REQUIRE_CMP(rc3f.x1, >=, rc3f.x0, "f"); + REQUIRE_CMP(rc3f.y1, >=, rc3f.y0, "f"); + REQUIRE_CMP(rc3f.z1, >=, rc3f.z0, "f"); + + pl_rect2d rc2r = pl_rect2df_round(&rc2f); + pl_rect3d rc3r = pl_rect3df_round(&rc3f); + + REQUIRE_CMP(fabs(rc2r.x0 - rc2f.x0), <=, 0.5, "f"); + REQUIRE_CMP(fabs(rc2r.x1 - rc2f.x1), <=, 0.5, "f"); + REQUIRE_CMP(fabs(rc2r.y0 - rc2f.y0), <=, 0.5, "f"); + REQUIRE_CMP(fabs(rc2r.y1 - rc2f.y1), <=, 0.5, "f"); + + REQUIRE_CMP(fabs(rc3r.x0 - rc3f.x0), <=, 0.5, "f"); + REQUIRE_CMP(fabs(rc3r.x1 - rc3f.x1), <=, 0.5, "f"); + REQUIRE_CMP(fabs(rc3r.y0 - rc3f.y0), <=, 0.5, "f"); + REQUIRE_CMP(fabs(rc3r.y1 - rc3f.y1), <=, 0.5, "f"); + REQUIRE_CMP(fabs(rc3r.z0 - rc3f.z0), <=, 0.5, "f"); + REQUIRE_CMP(fabs(rc3r.z1 - rc3f.z1), <=, 0.5, "f"); + + pl_transform3x3 tr = { + .mat = {{ + { RANDOM, RANDOM, RANDOM }, + { RANDOM, RANDOM, RANDOM }, + { RANDOM, RANDOM, RANDOM }, + }}, + .c = { RANDOM, RANDOM, RANDOM }, + }; + + pl_transform3x3 tr2 = tr; + float scale = 1.0 + RANDOM; + pl_transform3x3_scale(&tr2, scale); + pl_transform3x3_invert(&tr2); + pl_transform3x3_invert(&tr2); + pl_transform3x3_scale(&tr2, 1.0 / scale); + + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 3; j++) { + printf("%f %f\n", tr.mat.m[i][j], tr2.mat.m[i][j]); + REQUIRE_FEQ(tr.mat.m[i][j], tr2.mat.m[i][j], 1e-4); + } + REQUIRE_FEQ(tr.c[i], tr2.c[i], 1e-4); + } + + // Test aspect ratio code + const pl_rect2df rc1080p 
= {0, 0, 1920, 1080}; + const pl_rect2df rc43 = {0, 0, 1024, 768}; + pl_rect2df rc; + + REQUIRE_FEQ(pl_rect2df_aspect(&rc1080p), 16.0/9.0, 1e-8); + REQUIRE_FEQ(pl_rect2df_aspect(&rc43), 4.0/3.0, 1e-8); + +#define pl_rect2df_midx(rc) (((rc).x0 + (rc).x1) / 2.0) +#define pl_rect2df_midy(rc) (((rc).y0 + (rc).y1) / 2.0) + + for (float aspect = 0.2; aspect < 3.0; aspect += 0.4) { + for (float scan = 0.0; scan <= 1.0; scan += 0.5) { + rc = rc1080p; + pl_rect2df_aspect_set(&rc, aspect, scan); + printf("aspect %.2f, panscan %.1f: {%f %f} -> {%f %f}\n", + aspect, scan, rc.x0, rc.y0, rc.x1, rc.y1); + REQUIRE_FEQ(pl_rect2df_aspect(&rc), aspect, 1e-6); + REQUIRE_FEQ(pl_rect2df_midx(rc), pl_rect2df_midx(rc1080p), 1e-6); + REQUIRE_FEQ(pl_rect2df_midy(rc), pl_rect2df_midy(rc1080p), 1e-6); + } + } + + rc = rc1080p; + pl_rect2df_aspect_fit(&rc, &rc43, 0.0); + REQUIRE_FEQ(pl_rect2df_aspect(&rc), pl_rect2df_aspect(&rc43), 1e-6); + REQUIRE_FEQ(pl_rect2df_midx(rc), pl_rect2df_midx(rc1080p), 1e-6); + REQUIRE_FEQ(pl_rect2df_midy(rc), pl_rect2df_midy(rc1080p), 1e-6); + REQUIRE_FEQ(pl_rect_w(rc), pl_rect_w(rc43), 1e-6); + REQUIRE_FEQ(pl_rect_h(rc), pl_rect_h(rc43), 1e-6); + + rc = rc43; + pl_rect2df_aspect_fit(&rc, &rc1080p, 0.0); + REQUIRE_FEQ(pl_rect2df_aspect(&rc), pl_rect2df_aspect(&rc1080p), 1e-6); + REQUIRE_FEQ(pl_rect2df_midx(rc), pl_rect2df_midx(rc43), 1e-6); + REQUIRE_FEQ(pl_rect2df_midy(rc), pl_rect2df_midy(rc43), 1e-6); + REQUIRE_FEQ(pl_rect_w(rc), pl_rect_w(rc43), 1e-6); + + rc = (pl_rect2df) { 1920, 1080, 0, 0 }; + pl_rect2df_offset(&rc, 50, 100); + REQUIRE_FEQ(rc.x0, 1870, 1e-6); + REQUIRE_FEQ(rc.x1, -50, 1e-6); + REQUIRE_FEQ(rc.y0, 980, 1e-6); + REQUIRE_FEQ(rc.y1, -100, 1e-6); +} diff --git a/src/tests/d3d11.c b/src/tests/d3d11.c new file mode 100644 index 0000000..256af35 --- /dev/null +++ b/src/tests/d3d11.c @@ -0,0 +1,59 @@ +#include "gpu_tests.h" +#include "d3d11/gpu.h" +#include <dxgi1_2.h> + +#include <libplacebo/d3d11.h> + +int main() +{ + pl_log log = pl_test_logger(); + IDXGIFactory1 *factory = NULL; + IDXGIAdapter1 *adapter1 = NULL; + HRESULT hr; + + HMODULE dxgi = LoadLibraryW(L"dxgi.dll"); + if (!dxgi) + return SKIP; + + __typeof__(&CreateDXGIFactory1) pCreateDXGIFactory1 = + (void *) GetProcAddress(dxgi, "CreateDXGIFactory1"); + if (!pCreateDXGIFactory1) + return SKIP; + + hr = pCreateDXGIFactory1(&IID_IDXGIFactory1, (void **) &factory); + if (FAILED(hr)) { + printf("Failed to create DXGI factory\n"); + return SKIP; + } + + // Test all attached devices + for (int i = 0;; i++) { + hr = IDXGIFactory1_EnumAdapters1(factory, i, &adapter1); + if (hr == DXGI_ERROR_NOT_FOUND) + break; + if (FAILED(hr)) { + printf("Failed to enumerate adapters\n"); + return SKIP; + } + + DXGI_ADAPTER_DESC1 desc; + hr = IDXGIAdapter1_GetDesc1(adapter1, &desc); + if (FAILED(hr)) { + printf("Failed to enumerate adapters\n"); + return SKIP; + } + SAFE_RELEASE(adapter1); + + const struct pl_d3d11_t *d3d11 = pl_d3d11_create(log, pl_d3d11_params( + .debug = true, + .adapter_luid = desc.AdapterLuid, + )); + REQUIRE(d3d11); + + gpu_shader_tests(d3d11->gpu); + + pl_d3d11_destroy(&d3d11); + } + + SAFE_RELEASE(factory); +} diff --git a/src/tests/dav1d.c b/src/tests/dav1d.c new file mode 100644 index 0000000..7e2439f --- /dev/null +++ b/src/tests/dav1d.c @@ -0,0 +1,45 @@ +#include "tests.h" +#include "libplacebo/utils/dav1d.h" + +int main() +{ + // Test enum functions + for (enum pl_color_system sys = 0; sys < PL_COLOR_SYSTEM_COUNT; sys++) { + // Exceptions to the rule, due to different handling in dav1d + if (sys == 
PL_COLOR_SYSTEM_BT_2100_HLG || sys == PL_COLOR_SYSTEM_XYZ) + continue; + + enum Dav1dMatrixCoefficients mc = pl_system_to_dav1d(sys); + enum pl_color_system sys2 = pl_system_from_dav1d(mc); + if (sys2) + REQUIRE_CMP(sys, ==, sys2, "u"); + } + + for (enum pl_color_levels lev = 0; lev < PL_COLOR_LEVELS_COUNT; lev++) { + int range = pl_levels_to_dav1d(lev); + enum pl_color_levels lev2 = pl_levels_from_dav1d(range); + if (lev != PL_COLOR_LEVELS_UNKNOWN) + REQUIRE_CMP(lev, ==, lev2, "u"); + } + + for (enum pl_color_primaries prim = 0; prim < PL_COLOR_PRIM_COUNT; prim++) { + enum Dav1dColorPrimaries dpri = pl_primaries_to_dav1d(prim); + enum pl_color_primaries prim2 = pl_primaries_from_dav1d(dpri); + if (prim2) + REQUIRE_CMP(prim, ==, prim2, "u"); + } + + for (enum pl_color_transfer trc = 0; trc < PL_COLOR_TRC_COUNT; trc++) { + enum Dav1dTransferCharacteristics dtrc = pl_transfer_to_dav1d(trc); + enum pl_color_transfer trc2 = pl_transfer_from_dav1d(dtrc); + if (trc2) + REQUIRE_CMP(trc, ==, trc2, "u"); + } + + for (enum pl_chroma_location loc = 0; loc < PL_CHROMA_COUNT; loc++) { + enum Dav1dChromaSamplePosition dloc = pl_chroma_to_dav1d(loc); + enum pl_chroma_location loc2 = pl_chroma_from_dav1d(dloc); + if (loc2) + REQUIRE_CMP(loc, ==, loc2, "u"); + } +} diff --git a/src/tests/dither.c b/src/tests/dither.c new file mode 100644 index 0000000..c9f639c --- /dev/null +++ b/src/tests/dither.c @@ -0,0 +1,41 @@ +#include "tests.h" + +#include <libplacebo/dither.h> +#include <libplacebo/shaders/dithering.h> + +#define SHIFT 4 +#define SIZE (1 << SHIFT) +float data[SIZE][SIZE]; + +int main() +{ + printf("Ordered dither matrix:\n"); + pl_generate_bayer_matrix(&data[0][0], SIZE); + for (int y = 0; y < SIZE; y++) { + for (int x = 0; x < SIZE; x++) + printf(" %3d", (int)(data[y][x] * SIZE * SIZE)); + printf("\n"); + } + + printf("Blue noise dither matrix:\n"); + pl_generate_blue_noise(&data[0][0], SHIFT); + for (int y = 0; y < SIZE; y++) { + for (int x = 0; x < SIZE; x++) + printf(" %3d", (int)(data[y][x] * SIZE * SIZE)); + printf("\n"); + } + + // Generate an example of a dither shader + pl_log log = pl_test_logger(); + pl_shader sh = pl_shader_alloc(log, NULL); + pl_shader_obj obj = NULL; + + pl_shader_dither(sh, 8, &obj, NULL); + const struct pl_shader_res *res = pl_shader_finalize(sh); + REQUIRE(res); + printf("Generated dither shader:\n%s\n", res->glsl); + + pl_shader_obj_destroy(&obj); + pl_shader_free(&sh); + pl_log_destroy(&log); +} diff --git a/src/tests/dummy.c b/src/tests/dummy.c new file mode 100644 index 0000000..0e87a2c --- /dev/null +++ b/src/tests/dummy.c @@ -0,0 +1,70 @@ +#include "gpu_tests.h" + +#include <libplacebo/dummy.h> + +int main() +{ + pl_log log = pl_test_logger(); + pl_gpu gpu = pl_gpu_dummy_create(log, NULL); + pl_buffer_tests(gpu); + pl_texture_tests(gpu); + + // Attempt creating a shader and accessing the resulting LUT + pl_tex dummy = pl_tex_dummy_create(gpu, pl_tex_dummy_params( + .w = 100, + .h = 100, + .format = pl_find_named_fmt(gpu, "rgba8"), + )); + + struct pl_sample_src src = { + .tex = dummy, + .new_w = 1000, + .new_h = 1000, + }; + + pl_shader_obj lut = NULL; + struct pl_sample_filter_params filter_params = { + .filter = pl_filter_ewa_lanczos, + .lut = &lut, + }; + + pl_shader sh = pl_shader_alloc(log, pl_shader_params( .gpu = gpu )); + REQUIRE(pl_shader_sample_polar(sh, &src, &filter_params)); + const struct pl_shader_res *res = pl_shader_finalize(sh); + REQUIRE(res); + + for (int n = 0; n < res->num_descriptors; n++) { + const struct pl_shader_desc *sd = 
&res->descriptors[n]; + if (sd->desc.type != PL_DESC_SAMPLED_TEX) + continue; + + pl_tex tex = sd->binding.object; + const float *data = (float *) pl_tex_dummy_data(tex); + if (!data) + continue; // means this was the `dummy` texture + +#ifdef PRINT_LUTS + for (int i = 0; i < tex->params.w; i++) + printf("lut[%d] = %f\n", i, data[i]); +#endif + } + + // Try out generation of the sampler2D interface + src.tex = NULL; + src.tex_w = 100; + src.tex_h = 100; + src.format = PL_FMT_UNORM; + src.sampler = PL_SAMPLER_NORMAL; + src.mode = PL_TEX_SAMPLE_LINEAR; + + pl_shader_reset(sh, pl_shader_params( .gpu = gpu )); + REQUIRE(pl_shader_sample_polar(sh, &src, &filter_params)); + REQUIRE((res = pl_shader_finalize(sh))); + REQUIRE_CMP(res->input, ==, PL_SHADER_SIG_SAMPLER, "u"); + + pl_shader_free(&sh); + pl_shader_obj_destroy(&lut); + pl_tex_destroy(gpu, &dummy); + pl_gpu_dummy_destroy(&gpu); + pl_log_destroy(&log); +} diff --git a/src/tests/filters.c b/src/tests/filters.c new file mode 100644 index 0000000..b6b323c --- /dev/null +++ b/src/tests/filters.c @@ -0,0 +1,81 @@ +#include "tests.h" + +#include <libplacebo/filters.h> + +int main() +{ + pl_log log = pl_test_logger(); + + for (int i = 0; i < pl_num_filter_functions; i++) { + const struct pl_filter_function *fun = pl_filter_functions[i]; + if (fun->opaque) + continue; + + printf("Testing filter function '%s'\n", fun->name); + + struct pl_filter_ctx ctx = { .radius = fun->radius }; + memcpy(ctx.params, fun->params, sizeof(ctx.params)); + + // Ensure the kernel is correctly scaled + REQUIRE_FEQ(fun->weight(&ctx, 0.0), 1.0, 1e-7); + + // Only box filters are radius 1, these are unwindowed by design. + // Gaussian technically never reaches 0 even at its preconfigured radius. + if (fun->radius > 1.0 && fun != &pl_filter_function_gaussian) + REQUIRE_FEQ(fun->weight(&ctx, fun->radius), 0.0, 1e-7); + } + + for (int c = 0; c < pl_num_filter_configs; c++) { + const struct pl_filter_config *conf = pl_filter_configs[c]; + if (conf->kernel->opaque) + continue; + + printf("Testing filter config '%s'\n", conf->name); + pl_filter flt = pl_filter_generate(log, pl_filter_params( + .config = *conf, + .lut_entries = 256, + .cutoff = 1e-3, + )); + REQUIRE(flt); + const float radius = PL_DEF(conf->radius, conf->kernel->radius); + REQUIRE_CMP(flt->radius, <=, radius, "f"); + REQUIRE_CMP(flt->radius_zero, >, 0.0, "f"); + REQUIRE_CMP(flt->radius_zero, <=, flt->radius, "f"); + + if (conf->polar) { + + // Test LUT accuracy + const int range = flt->params.lut_entries - 1; + double scale = flt->weights[0] / pl_filter_sample(conf, 0.0); + double err = 0.0; + for (float k = 0.0; k <= 1.0; k += 1e-3f) { + double ref = scale * pl_filter_sample(conf, k * flt->radius); + double idx = k * range; + int base = floorf(idx); + double fpart = idx - base; + int next = PL_MIN(base + 1, range); + double interp = PL_MIX(flt->weights[base], flt->weights[next], fpart); + err = fmaxf(err, fabs(interp - ref)); + } + REQUIRE_CMP(err, <=, 1e-4, "g"); + + } else { + + // Ensure the weights for each row add up to unity + for (int i = 0; i < flt->params.lut_entries; i++) { + const float *row = flt->weights + i * flt->row_stride; + float sum = 0.0; + REQUIRE(flt->row_size); + REQUIRE_CMP(flt->row_stride, >=, flt->row_size, "d"); + for (int n = 0; n < flt->row_size; n++) + sum += row[n]; + REQUIRE_FEQ(sum, 1.0, 1e-6); + } + + } + + pl_filter_free(&flt); + } + + pl_log_destroy(&log); +} diff --git a/src/tests/fuzz/lut.c b/src/tests/fuzz/lut.c new file mode 100644 index 0000000..24e5f89 --- /dev/null 
+++ b/src/tests/fuzz/lut.c @@ -0,0 +1,24 @@ +#include "../tests.h" + +#include <libplacebo/shaders/lut.h> + +__AFL_FUZZ_INIT(); + +#pragma clang optimize off + +int main() +{ + struct pl_custom_lut *lut; + +#ifdef __AFL_HAVE_MANUAL_CONTROL + __AFL_INIT(); +#endif + + unsigned char *buf = __AFL_FUZZ_TESTCASE_BUF; + + while (__AFL_LOOP(100000)) { + size_t len = __AFL_FUZZ_TESTCASE_LEN; + lut = pl_lut_parse_cube(NULL, (char *) buf, len); + pl_lut_free(&lut); + } +} diff --git a/src/tests/fuzz/options.c b/src/tests/fuzz/options.c new file mode 100644 index 0000000..c88e462 --- /dev/null +++ b/src/tests/fuzz/options.c @@ -0,0 +1,26 @@ +#include "../tests.h" + +#include <libplacebo/options.h> + +__AFL_FUZZ_INIT(); + +#pragma clang optimize off + +int main() +{ + pl_options opts = pl_options_alloc(NULL); + +#ifdef __AFL_HAVE_MANUAL_CONTROL + __AFL_INIT(); +#endif + + unsigned char *buf = __AFL_FUZZ_TESTCASE_BUF; + + while (__AFL_LOOP(100000)) { + size_t len = __AFL_FUZZ_TESTCASE_LEN; + buf[len - 1] = '\0'; // ensure proper null termination + pl_options_load(opts, (const char *) buf); + pl_options_save(opts); + pl_options_reset(opts, NULL); + } +} diff --git a/src/tests/fuzz/shaders.c b/src/tests/fuzz/shaders.c new file mode 100644 index 0000000..2e3e92c --- /dev/null +++ b/src/tests/fuzz/shaders.c @@ -0,0 +1,166 @@ +#include "../tests.h" +#include "shaders.h" + +#include <libplacebo/dummy.h> +#include <libplacebo/shaders/colorspace.h> +#include <libplacebo/shaders/custom.h> +#include <libplacebo/shaders/sampling.h> + +__AFL_FUZZ_INIT(); + +#pragma clang optimize off + +int main() +{ + pl_gpu gpu = pl_gpu_dummy_create(NULL, NULL); + +#define WIDTH 64 +#define HEIGHT 64 +#define COMPS 4 + + static const float empty[HEIGHT][WIDTH][COMPS] = {0}; + + struct pl_sample_src src = { + .tex = pl_tex_create(gpu, pl_tex_params( + .format = pl_find_fmt(gpu, PL_FMT_FLOAT, COMPS, 0, 32, PL_FMT_CAP_SAMPLEABLE), + .initial_data = empty, + .sampleable = true, + .w = WIDTH, + .h = HEIGHT, + )), + .new_w = WIDTH * 2, + .new_h = HEIGHT * 2, + }; + + if (!src.tex) + return 1; + +#ifdef __AFL_HAVE_MANUAL_CONTROL + __AFL_INIT(); +#endif + + unsigned char *buf = __AFL_FUZZ_TESTCASE_BUF; + while (__AFL_LOOP(10000)) { + +#define STACK_SIZE 16 + pl_shader stack[STACK_SIZE] = {0}; + int idx = 0; + + stack[0] = pl_shader_alloc(NULL, pl_shader_params( + .gpu = gpu, + )); + + pl_shader sh = stack[idx]; + pl_shader_obj polar = NULL, ortho = NULL, peak = NULL, dither = NULL; + + size_t len = __AFL_FUZZ_TESTCASE_LEN; + for (size_t pos = 0; pos < len; pos++) { + switch (buf[pos]) { + // Sampling steps + case 'S': + pl_shader_sample_direct(sh, &src); + break; + case 'D': + pl_shader_deband(sh, &src, NULL); + break; + case 'P': + pl_shader_sample_polar(sh, &src, pl_sample_filter_params( + .filter = pl_filter_ewa_lanczos, + .lut = &polar, + )); + break; + case 'O': ; + struct pl_sample_src srcfix = src; + srcfix.new_w = WIDTH; + pl_shader_sample_ortho2(sh, &srcfix, pl_sample_filter_params( + .filter = pl_filter_spline36, + .lut = &ortho, + )); + break; + case 'X': + pl_shader_custom(sh, &(struct pl_custom_shader) { + .input = PL_SHADER_SIG_NONE, + .output = PL_SHADER_SIG_COLOR, + .body = "// merge subpasses", + }); + break; + + // Colorspace transformation steps + case 'y': { + struct pl_color_repr repr = pl_color_repr_jpeg; + pl_shader_decode_color(sh, &repr, NULL); + break; + } + case 'p': + pl_shader_detect_peak(sh, pl_color_space_hdr10, &peak, NULL); + break; + case 'm': + pl_shader_color_map(sh, NULL, pl_color_space_bt709, + 
pl_color_space_monitor, NULL, false); + break; + case 't': + pl_shader_color_map(sh, NULL, pl_color_space_hdr10, + pl_color_space_monitor, &peak, false); + break; + case 'd': + pl_shader_dither(sh, 8, &dither, pl_dither_params( + // Picked to speed up calculation + .method = PL_DITHER_ORDERED_LUT, + .lut_size = 2, + )); + break; + + // Push and pop subshader commands + case '(': + if (idx+1 == STACK_SIZE) + goto invalid; + + idx++; + if (!stack[idx]) { + stack[idx] = pl_shader_alloc(NULL, pl_shader_params( + .gpu = gpu, + .id = idx, + )); + } + sh = stack[idx]; + break; + + case ')': + if (idx == 0) + goto invalid; + + idx--; + sh_subpass(stack[idx], stack[idx + 1]); + pl_shader_reset(stack[idx + 1], pl_shader_params( + .gpu = gpu, + .id = idx + 1, + )); + sh = stack[idx]; + break; + + default: + goto invalid; + } + } + + // Merge remaining shaders + while (idx > 0) { + sh_subpass(stack[idx - 1], stack[idx]); + idx--; + } + + pl_shader_finalize(stack[0]); + +invalid: + for (int i = 0; i < STACK_SIZE; i++) + pl_shader_free(&stack[i]); + + pl_shader_obj_destroy(&polar); + pl_shader_obj_destroy(&ortho); + pl_shader_obj_destroy(&peak); + pl_shader_obj_destroy(&dither); + } + + pl_tex_destroy(gpu, &src.tex); + pl_gpu_dummy_destroy(&gpu); +} diff --git a/src/tests/fuzz/user_shaders.c b/src/tests/fuzz/user_shaders.c new file mode 100644 index 0000000..bbb98c8 --- /dev/null +++ b/src/tests/fuzz/user_shaders.c @@ -0,0 +1,28 @@ +#include "../tests.h" + +#include <libplacebo/dummy.h> +#include <libplacebo/shaders/custom.h> + +__AFL_FUZZ_INIT(); + +#pragma clang optimize off + +int main() +{ + pl_gpu gpu = pl_gpu_dummy_create(NULL, NULL); + const struct pl_hook *hook; + +#ifdef __AFL_HAVE_MANUAL_CONTROL + __AFL_INIT(); +#endif + + unsigned char *buf = __AFL_FUZZ_TESTCASE_BUF; + + while (__AFL_LOOP(100000)) { + size_t len = __AFL_FUZZ_TESTCASE_LEN; + hook = pl_mpv_user_shader_parse(gpu, (char *) buf, len); + pl_mpv_user_shader_destroy(&hook); + } + + pl_gpu_dummy_destroy(&gpu); +} diff --git a/src/tests/gpu_tests.h b/src/tests/gpu_tests.h new file mode 100644 index 0000000..f14f260 --- /dev/null +++ b/src/tests/gpu_tests.h @@ -0,0 +1,1741 @@ +#include "tests.h" +#include "shaders.h" + +#include <libplacebo/renderer.h> +#include <libplacebo/utils/frame_queue.h> +#include <libplacebo/utils/upload.h> + +//#define PRINT_OUTPUT + +static void pl_buffer_tests(pl_gpu gpu) +{ + const size_t buf_size = 1024; + if (buf_size > gpu->limits.max_buf_size) + return; + + uint8_t *test_src = malloc(buf_size * 2); + uint8_t *test_dst = test_src + buf_size; + assert(test_src && test_dst); + memset(test_dst, 0, buf_size); + for (int i = 0; i < buf_size; i++) + test_src[i] = RANDOM_U8; + + pl_buf buf = NULL, tbuf = NULL; + + printf("test buffer static creation and readback\n"); + buf = pl_buf_create(gpu, pl_buf_params( + .size = buf_size, + .host_readable = true, + .initial_data = test_src, + )); + + REQUIRE(buf); + REQUIRE(pl_buf_read(gpu, buf, 0, test_dst, buf_size)); + REQUIRE_MEMEQ(test_src, test_dst, buf_size); + pl_buf_destroy(gpu, &buf); + + printf("test buffer empty creation, update and readback\n"); + memset(test_dst, 0, buf_size); + buf = pl_buf_create(gpu, pl_buf_params( + .size = buf_size, + .host_writable = true, + .host_readable = true, + )); + + REQUIRE(buf); + pl_buf_write(gpu, buf, 0, test_src, buf_size); + REQUIRE(pl_buf_read(gpu, buf, 0, test_dst, buf_size)); + REQUIRE_MEMEQ(test_src, test_dst, buf_size); + pl_buf_destroy(gpu, &buf); + + printf("test buffer-buffer copy and readback\n"); + 
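// Copies one GPU buffer into another with pl_buf_copy, then reads the destination back to verify the data matches the source. +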
memset(test_dst, 0, buf_size); + buf = pl_buf_create(gpu, pl_buf_params( + .size = buf_size, + .initial_data = test_src, + )); + + tbuf = pl_buf_create(gpu, pl_buf_params( + .size = buf_size, + .host_readable = true, + )); + + REQUIRE(buf && tbuf); + pl_buf_copy(gpu, tbuf, 0, buf, 0, buf_size); + REQUIRE(pl_buf_read(gpu, tbuf, 0, test_dst, buf_size)); + REQUIRE_MEMEQ(test_src, test_dst, buf_size); + pl_buf_destroy(gpu, &buf); + pl_buf_destroy(gpu, &tbuf); + + if (buf_size <= gpu->limits.max_mapped_size) { + printf("test host mapped buffer readback\n"); + buf = pl_buf_create(gpu, pl_buf_params( + .size = buf_size, + .host_mapped = true, + .initial_data = test_src, + )); + + REQUIRE(buf); + REQUIRE(!pl_buf_poll(gpu, buf, 0)); + REQUIRE_MEMEQ(test_src, buf->data, buf_size); + pl_buf_destroy(gpu, &buf); + } + + // `compute_queues` check is to exclude dummy GPUs here + if (buf_size <= gpu->limits.max_ssbo_size && gpu->limits.compute_queues) + { + printf("test endian swapping\n"); + buf = pl_buf_create(gpu, pl_buf_params( + .size = buf_size, + .storable = true, + .initial_data = test_src, + )); + + tbuf = pl_buf_create(gpu, pl_buf_params( + .size = buf_size, + .storable = true, + .host_readable = true, + )); + + REQUIRE(buf && tbuf); + REQUIRE(pl_buf_copy_swap(gpu, &(struct pl_buf_copy_swap_params) { + .src = buf, + .dst = tbuf, + .size = buf_size, + .wordsize = 2, + })); + REQUIRE(pl_buf_read(gpu, tbuf, 0, test_dst, buf_size)); + for (int i = 0; i < buf_size / 2; i++) { + REQUIRE_CMP(test_src[2 * i + 0], ==, test_dst[2 * i + 1], PRIu8); + REQUIRE_CMP(test_src[2 * i + 1], ==, test_dst[2 * i + 0], PRIu8); + } + // test endian swap in-place + REQUIRE(pl_buf_copy_swap(gpu, &(struct pl_buf_copy_swap_params) { + .src = tbuf, + .dst = tbuf, + .size = buf_size, + .wordsize = 4, + })); + REQUIRE(pl_buf_read(gpu, tbuf, 0, test_dst, buf_size)); + for (int i = 0; i < buf_size / 4; i++) { + REQUIRE_CMP(test_src[4 * i + 0], ==, test_dst[4 * i + 2], PRIu8); + REQUIRE_CMP(test_src[4 * i + 1], ==, test_dst[4 * i + 3], PRIu8); + REQUIRE_CMP(test_src[4 * i + 2], ==, test_dst[4 * i + 0], PRIu8); + REQUIRE_CMP(test_src[4 * i + 3], ==, test_dst[4 * i + 1], PRIu8); + } + pl_buf_destroy(gpu, &buf); + pl_buf_destroy(gpu, &tbuf); + } + + free(test_src); +} + +static void test_cb(void *priv) +{ + bool *flag = priv; + *flag = true; +} + +static void pl_test_roundtrip(pl_gpu gpu, pl_tex tex[2], + uint8_t *src, uint8_t *dst) +{ + if (!tex[0] || !tex[1]) { + printf("failed creating test textures... skipping this test\n"); + return; + } + + int texels = tex[0]->params.w; + texels *= tex[0]->params.h ? tex[0]->params.h : 1; + texels *= tex[0]->params.d ? tex[0]->params.d : 1; + + pl_fmt fmt = tex[0]->params.format; + size_t bytes = texels * fmt->texel_size; + memset(src, 0, bytes); + memset(dst, 0, bytes); + + for (size_t i = 0; i < bytes; i++) + src[i] = RANDOM_U8; + + pl_timer ul, dl; + ul = pl_timer_create(gpu); + dl = pl_timer_create(gpu); + + bool ran_ul = false, ran_dl = false; + + REQUIRE(pl_tex_upload(gpu, &(struct pl_tex_transfer_params){ + .tex = tex[0], + .ptr = src, + .timer = ul, + .callback = gpu->limits.callbacks ? 
test_cb : NULL, + .priv = &ran_ul, + })); + + // Test blitting, if possible for this format + pl_tex dst_tex = tex[0]; + if (tex[0]->params.blit_src && tex[1]->params.blit_dst) { + pl_tex_clear_ex(gpu, tex[1], (union pl_clear_color){0}); // for testing + pl_tex_blit(gpu, &(struct pl_tex_blit_params) { + .src = tex[0], + .dst = tex[1], + }); + dst_tex = tex[1]; + } + + REQUIRE(pl_tex_download(gpu, &(struct pl_tex_transfer_params){ + .tex = dst_tex, + .ptr = dst, + .timer = dl, + .callback = gpu->limits.callbacks ? test_cb : NULL, + .priv = &ran_dl, + })); + + pl_gpu_finish(gpu); + if (gpu->limits.callbacks) + REQUIRE(ran_ul && ran_dl); + + if (fmt->emulated && fmt->type == PL_FMT_FLOAT) { + // TODO: can't memcmp here because bits might be lost due to the + // emulated 16/32 bit upload paths, figure out a better way to + // generate data and verify the roundtrip! + } else { + REQUIRE_MEMEQ(src, dst, bytes); + } + + // Report timer results + printf("upload time: %"PRIu64", download time: %"PRIu64"\n", + pl_timer_query(gpu, ul), pl_timer_query(gpu, dl)); + + pl_timer_destroy(gpu, &ul); + pl_timer_destroy(gpu, &dl); +} + +static void pl_texture_tests(pl_gpu gpu) +{ + const size_t max_size = 16*16*16 * 4 *sizeof(double); + uint8_t *test_src = malloc(max_size * 2); + uint8_t *test_dst = test_src + max_size; + + for (int f = 0; f < gpu->num_formats; f++) { + pl_fmt fmt = gpu->formats[f]; + if (fmt->opaque || !(fmt->caps & PL_FMT_CAP_HOST_READABLE)) + continue; + + printf("testing texture roundtrip for format %s\n", fmt->name); + assert(fmt->texel_size <= 4 * sizeof(double)); + + struct pl_tex_params ref_params = { + .format = fmt, + .blit_src = (fmt->caps & PL_FMT_CAP_BLITTABLE), + .blit_dst = (fmt->caps & PL_FMT_CAP_BLITTABLE), + .host_writable = true, + .host_readable = true, + .debug_tag = PL_DEBUG_TAG, + }; + + pl_tex tex[2]; + + if (gpu->limits.max_tex_1d_dim >= 16) { + printf("... 1D\n"); + struct pl_tex_params params = ref_params; + params.w = 16; + if (!gpu->limits.blittable_1d_3d) + params.blit_src = params.blit_dst = false; + for (int i = 0; i < PL_ARRAY_SIZE(tex); i++) + tex[i] = pl_tex_create(gpu, &params); + pl_test_roundtrip(gpu, tex, test_src, test_dst); + for (int i = 0; i < PL_ARRAY_SIZE(tex); i++) + pl_tex_destroy(gpu, &tex[i]); + } + + if (gpu->limits.max_tex_2d_dim >= 16) { + printf("... 2D\n"); + struct pl_tex_params params = ref_params; + params.w = params.h = 16; + for (int i = 0; i < PL_ARRAY_SIZE(tex); i++) + tex[i] = pl_tex_create(gpu, &params); + pl_test_roundtrip(gpu, tex, test_src, test_dst); + for (int i = 0; i < PL_ARRAY_SIZE(tex); i++) + pl_tex_destroy(gpu, &tex[i]); + } + + if (gpu->limits.max_tex_3d_dim >= 16) { + printf("... 
3D\n"); + struct pl_tex_params params = ref_params; + params.w = params.h = params.d = 16; + if (!gpu->limits.blittable_1d_3d) + params.blit_src = params.blit_dst = false; + for (int i = 0; i < PL_ARRAY_SIZE(tex); i++) + tex[i] = pl_tex_create(gpu, &params); + pl_test_roundtrip(gpu, tex, test_src, test_dst); + for (int i = 0; i < PL_ARRAY_SIZE(tex); i++) + pl_tex_destroy(gpu, &tex[i]); + } + } + + free(test_src); +} + +static void pl_planar_tests(pl_gpu gpu) +{ + pl_fmt fmt = pl_find_named_fmt(gpu, "g8_b8_r8_420"); + if (!fmt) + return; + REQUIRE_CMP(fmt->num_planes, ==, 3, "d"); + + const int width = 64, height = 32; + pl_tex tex = pl_tex_create(gpu, pl_tex_params( + .w = width, + .h = height, + .format = fmt, + .blit_dst = true, + .host_readable = true, + )); + if (!tex) + return; + for (int i = 0; i < fmt->num_planes; i++) + REQUIRE(tex->planes[i]); + + pl_tex plane = tex->planes[1]; + uint8_t data[(width * height) >> 2]; + REQUIRE_CMP(plane->params.w * plane->params.h, ==, PL_ARRAY_SIZE(data), "d"); + + pl_tex_clear(gpu, plane, (float[]){ (float) 0x80 / 0xFF, 0.0, 0.0, 1.0 }); + REQUIRE(pl_tex_download(gpu, pl_tex_transfer_params( + .tex = plane, + .ptr = data, + ))); + + uint8_t ref[PL_ARRAY_SIZE(data)]; + memset(ref, 0x80, sizeof(ref)); + REQUIRE_MEMEQ(data, ref, PL_ARRAY_SIZE(data)); + + pl_tex_destroy(gpu, &tex); +} + +static void pl_shader_tests(pl_gpu gpu) +{ + if (gpu->glsl.version < 410) + return; + + const char *vert_shader = + "#version 410 \n" + "layout(location=0) in vec2 vertex_pos; \n" + "layout(location=1) in vec3 vertex_color; \n" + "layout(location=0) out vec3 frag_color; \n" + "void main() { \n" + " gl_Position = vec4(vertex_pos, 0, 1); \n" + " frag_color = vertex_color; \n" + "}"; + + const char *frag_shader = + "#version 410 \n" + "layout(location=0) in vec3 frag_color; \n" + "layout(location=0) out vec4 out_color; \n" + "void main() { \n" + " out_color = vec4(frag_color, 1.0); \n" + "}"; + + pl_fmt fbo_fmt; + enum pl_fmt_caps caps = PL_FMT_CAP_RENDERABLE | PL_FMT_CAP_BLITTABLE | + PL_FMT_CAP_LINEAR; + + fbo_fmt = pl_find_fmt(gpu, PL_FMT_FLOAT, 4, 16, 32, caps); + if (!fbo_fmt) + return; + +#define FBO_W 16 +#define FBO_H 16 + + pl_tex fbo; + fbo = pl_tex_create(gpu, &(struct pl_tex_params) { + .format = fbo_fmt, + .w = FBO_W, + .h = FBO_H, + .renderable = true, + .storable = !!(fbo_fmt->caps & PL_FMT_CAP_STORABLE), + .host_readable = true, + .blit_dst = true, + }); + REQUIRE(fbo); + + pl_tex_clear_ex(gpu, fbo, (union pl_clear_color){0}); + + pl_fmt vert_fmt; + vert_fmt = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 3); + REQUIRE(vert_fmt); + + static const struct vertex { float pos[2]; float color[3]; } vertices[] = { + {{-1.0, -1.0}, {0, 0, 0}}, + {{ 1.0, -1.0}, {1, 0, 0}}, + {{-1.0, 1.0}, {0, 1, 0}}, + {{ 1.0, 1.0}, {1, 1, 0}}, + }; + + pl_pass pass; + pass = pl_pass_create(gpu, &(struct pl_pass_params) { + .type = PL_PASS_RASTER, + .target_format = fbo_fmt, + .vertex_shader = vert_shader, + .glsl_shader = frag_shader, + + .vertex_type = PL_PRIM_TRIANGLE_STRIP, + .vertex_stride = sizeof(struct vertex), + .num_vertex_attribs = 2, + .vertex_attribs = (struct pl_vertex_attrib[]) {{ + .name = "vertex_pos", + .fmt = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 2), + .location = 0, + .offset = offsetof(struct vertex, pos), + }, { + .name = "vertex_color", + .fmt = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 3), + .location = 1, + .offset = offsetof(struct vertex, color), + }}, + }); + REQUIRE(pass); + if (pass->params.cached_program || pass->params.cached_program_len) { + // Ensure both are
set if either one is set + REQUIRE(pass->params.cached_program); + REQUIRE(pass->params.cached_program_len); + } + + pl_timer timer = pl_timer_create(gpu); + pl_pass_run(gpu, &(struct pl_pass_run_params) { + .pass = pass, + .target = fbo, + .vertex_count = PL_ARRAY_SIZE(vertices), + .vertex_data = vertices, + .timer = timer, + }); + + // Wait until this pass is complete and report the timer result + pl_gpu_finish(gpu); + printf("timer query result: %"PRIu64"\n", pl_timer_query(gpu, timer)); + pl_timer_destroy(gpu, &timer); + + static float test_data[FBO_H * FBO_W * 4] = {0}; + + // Test against the known pattern of `src`, only useful for roundtrip tests +#define TEST_FBO_PATTERN(eps, fmt, ...) \ + do { \ + printf("testing pattern of " fmt "\n", __VA_ARGS__); \ + REQUIRE(pl_tex_download(gpu, &(struct pl_tex_transfer_params) { \ + .tex = fbo, \ + .ptr = test_data, \ + })); \ + \ + for (int y = 0; y < FBO_H; y++) { \ + for (int x = 0; x < FBO_W; x++) { \ + float *color = &test_data[(y * FBO_W + x) * 4]; \ + REQUIRE_FEQ(color[0], (x + 0.5) / FBO_W, eps); \ + REQUIRE_FEQ(color[1], (y + 0.5) / FBO_H, eps); \ + REQUIRE_FEQ(color[2], 0.0, eps); \ + REQUIRE_FEQ(color[3], 1.0, eps); \ + } \ + } \ + } while (0) + + TEST_FBO_PATTERN(1e-6, "%s", "initial rendering"); + + if (sizeof(vertices) <= gpu->limits.max_vbo_size) { + // Test the use of an explicit vertex buffer + pl_buf vert = pl_buf_create(gpu, &(struct pl_buf_params) { + .size = sizeof(vertices), + .initial_data = vertices, + .drawable = true, + }); + + REQUIRE(vert); + pl_pass_run(gpu, &(struct pl_pass_run_params) { + .pass = pass, + .target = fbo, + .vertex_count = sizeof(vertices) / sizeof(struct vertex), + .vertex_buf = vert, + .buf_offset = 0, + }); + + pl_buf_destroy(gpu, &vert); + TEST_FBO_PATTERN(1e-6, "%s", "using vertex buffer"); + } + + // Test the use of index buffers + static const uint16_t indices[] = { 3, 2, 1, 0 }; + pl_pass_run(gpu, &(struct pl_pass_run_params) { + .pass = pass, + .target = fbo, + .vertex_count = PL_ARRAY_SIZE(indices), + .vertex_data = vertices, + .index_data = indices, + }); + + pl_pass_destroy(gpu, &pass); + TEST_FBO_PATTERN(1e-6, "%s", "using indexed rendering"); + + // Test the use of pl_dispatch + pl_dispatch dp = pl_dispatch_create(gpu->log, gpu); + pl_shader sh = pl_dispatch_begin(dp); + REQUIRE(pl_shader_custom(sh, &(struct pl_custom_shader) { + .body = "color = vec4(col, 1.0);", + .input = PL_SHADER_SIG_NONE, + .output = PL_SHADER_SIG_COLOR, + })); + + REQUIRE(pl_dispatch_vertex(dp, &(struct pl_dispatch_vertex_params) { + .shader = &sh, + .target = fbo, + .vertex_stride = sizeof(struct vertex), + .vertex_position_idx = 0, + .num_vertex_attribs = 2, + .vertex_attribs = (struct pl_vertex_attrib[]) {{ + .name = "pos", + .fmt = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 2), + .offset = offsetof(struct vertex, pos), + }, { + .name = "col", + .fmt = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 3), + .offset = offsetof(struct vertex, color), + }}, + + .vertex_type = PL_PRIM_TRIANGLE_STRIP, + .vertex_coords = PL_COORDS_NORMALIZED, + .vertex_count = PL_ARRAY_SIZE(vertices), + .vertex_data = vertices, + })); + + TEST_FBO_PATTERN(1e-6, "%s", "using custom vertices"); + + static float src_data[FBO_H * FBO_W * 4] = {0}; + memcpy(src_data, test_data, sizeof(src_data)); + + pl_tex src; + src = pl_tex_create(gpu, &(struct pl_tex_params) { + .format = fbo_fmt, + .w = FBO_W, + .h = FBO_H, + .storable = fbo->params.storable, + .sampleable = true, + .initial_data = src_data, + }); + + if (fbo->params.storable) { + // Test 1x1 blit, 
to make sure the scaling code runs + REQUIRE(pl_tex_blit_compute(gpu, &(struct pl_tex_blit_params) { + .src = src, + .dst = fbo, + .src_rc = {0, 0, 0, 1, 1, 1}, + .dst_rc = {0, 0, 0, FBO_W, FBO_H, 1}, + .sample_mode = PL_TEX_SAMPLE_NEAREST, + })); + + // Test non-resizing blit, which uses the efficient imageLoad path + REQUIRE(pl_tex_blit_compute(gpu, &(struct pl_tex_blit_params) { + .src = src, + .dst = fbo, + .src_rc = {0, 0, 0, FBO_W, FBO_H, 1}, + .dst_rc = {0, 0, 0, FBO_W, FBO_H, 1}, + .sample_mode = PL_TEX_SAMPLE_NEAREST, + })); + + TEST_FBO_PATTERN(1e-6, "%s", "pl_tex_blit_compute"); + } + + // Test encoding/decoding of all gamma functions, color spaces, etc. + for (enum pl_color_transfer trc = 0; trc < PL_COLOR_TRC_COUNT; trc++) { + struct pl_color_space test_csp = { + .transfer = trc, + .hdr.min_luma = PL_COLOR_HDR_BLACK, + }; + sh = pl_dispatch_begin(dp); + pl_shader_sample_nearest(sh, pl_sample_src( .tex = src )); + pl_shader_delinearize(sh, &test_csp); + pl_shader_linearize(sh, &test_csp); + REQUIRE(pl_dispatch_finish(dp, pl_dispatch_params( + .shader = &sh, + .target = fbo, + ))); + + float epsilon = pl_color_transfer_is_hdr(trc) ? 1e-4 : 1e-6; + TEST_FBO_PATTERN(epsilon, "transfer function %d", (int) trc); + } + + for (enum pl_color_system sys = 0; sys < PL_COLOR_SYSTEM_COUNT; sys++) { + if (sys == PL_COLOR_SYSTEM_DOLBYVISION) + continue; // requires metadata + sh = pl_dispatch_begin(dp); + pl_shader_sample_nearest(sh, pl_sample_src( .tex = src )); + pl_shader_encode_color(sh, &(struct pl_color_repr) { .sys = sys }); + pl_shader_decode_color(sh, &(struct pl_color_repr) { .sys = sys }, NULL); + REQUIRE(pl_dispatch_finish(dp, &(struct pl_dispatch_params) { + .shader = &sh, + .target = fbo, + })); + + float epsilon; + switch (sys) { + case PL_COLOR_SYSTEM_BT_2020_C: + case PL_COLOR_SYSTEM_XYZ: + epsilon = 1e-5; + break; + + case PL_COLOR_SYSTEM_BT_2100_PQ: + case PL_COLOR_SYSTEM_BT_2100_HLG: + // These seem to be horrifically noisy and prone to breaking on + // edge cases for some reason + // TODO: figure out why! 
+ continue; + + default: epsilon = 1e-6; break; + } + + TEST_FBO_PATTERN(epsilon, "color system %d", (int) sys); + } + + // Repeat this a few times to test the caching + pl_cache cache = pl_cache_create(pl_cache_params( .log = gpu->log )); + pl_gpu_set_cache(gpu, cache); + for (int i = 0; i < 10; i++) { + if (i == 5) { + printf("Recreating pl_dispatch to test the caching\n"); + size_t size = pl_dispatch_save(dp, NULL); + REQUIRE(size); + uint8_t *cache_data = malloc(size); + REQUIRE(cache_data); + REQUIRE_CMP(pl_dispatch_save(dp, cache_data), ==, size, "zu"); + + pl_dispatch_destroy(&dp); + dp = pl_dispatch_create(gpu->log, gpu); + pl_dispatch_load(dp, cache_data); + + // Test to make sure the pass regenerates the same cache + uint64_t hash = pl_str_hash((pl_str) { cache_data, size }); + REQUIRE_CMP(pl_dispatch_save(dp, NULL), ==, size, "zu"); + REQUIRE_CMP(pl_dispatch_save(dp, cache_data), ==, size, "zu"); + REQUIRE_CMP(pl_str_hash((pl_str) { cache_data, size }), ==, hash, PRIu64); + free(cache_data); + } + + sh = pl_dispatch_begin(dp); + + // For testing, force the use of CS if possible + if (gpu->glsl.compute) { + sh->type = SH_COMPUTE; + sh->group_size[0] = 8; + sh->group_size[1] = 8; + } + + pl_shader_deband(sh, pl_sample_src( .tex = src ), pl_deband_params( + .iterations = 0, + .grain = 0.0, + )); + + REQUIRE(pl_dispatch_finish(dp, &(struct pl_dispatch_params) { + .shader = &sh, + .target = fbo, + })); + TEST_FBO_PATTERN(1e-6, "deband iter %d", i); + } + + pl_gpu_set_cache(gpu, NULL); + pl_cache_destroy(&cache); + + // Test peak detection and readback if possible + sh = pl_dispatch_begin(dp); + pl_shader_sample_nearest(sh, pl_sample_src( .tex = src )); + + pl_shader_obj peak_state = NULL; + struct pl_color_space csp_gamma22 = { .transfer = PL_COLOR_TRC_GAMMA22 }; + struct pl_peak_detect_params peak_params = { .minimum_peak = 0.01 }; + if (pl_shader_detect_peak(sh, csp_gamma22, &peak_state, &peak_params)) { + REQUIRE(pl_dispatch_compute(dp, &(struct pl_dispatch_compute_params) { + .shader = &sh, + .width = fbo->params.w, + .height = fbo->params.h, + })); + + float peak, avg; + REQUIRE(pl_get_detected_peak(peak_state, &peak, &avg)); + + float real_peak = 0, real_avg = 0; + for (int y = 0; y < FBO_H; y++) { + for (int x = 0; x < FBO_W; x++) { + float *color = &src_data[(y * FBO_W + x) * 4]; + float luma = 0.212639f * powf(color[0], 2.2f) + + 0.715169f * powf(color[1], 2.2f) + + 0.072192f * powf(color[2], 2.2f); + luma = pl_hdr_rescale(PL_HDR_NORM, PL_HDR_PQ, luma); + real_peak = PL_MAX(real_peak, luma); + real_avg += luma; + } + } + real_avg = real_avg / (FBO_W * FBO_H); + + real_avg = pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, real_avg); + real_peak = pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, real_peak); + REQUIRE_FEQ(peak, real_peak, 1e-3); + REQUIRE_FEQ(avg, real_avg, 1e-2); + } + + pl_dispatch_abort(dp, &sh); + pl_shader_obj_destroy(&peak_state); + + // Test film grain synthesis + pl_shader_obj grain = NULL; + struct pl_film_grain_params grain_params = { + .tex = src, + .components = 3, + .component_mapping = { 0, 1, 2}, + .repr = &(struct pl_color_repr) { + .sys = PL_COLOR_SYSTEM_BT_709, + .levels = PL_COLOR_LEVELS_LIMITED, + .bits = { .color_depth = 10, .sample_depth = 10 }, + }, + }; + + for (int i = 0; i < 2; i++) { + grain_params.data.type = PL_FILM_GRAIN_AV1; + grain_params.data.params.av1 = av1_grain_data; + grain_params.data.params.av1.overlap = !!i; + grain_params.data.seed = rand(); + + sh = pl_dispatch_begin(dp); + pl_shader_film_grain(sh, &grain, &grain_params); + 
REQUIRE(pl_dispatch_finish(dp, &(struct pl_dispatch_params) { + .shader = &sh, + .target = fbo, + })); + } + + if (gpu->glsl.compute) { + grain_params.data.type = PL_FILM_GRAIN_H274; + grain_params.data.params.h274 = h274_grain_data; + grain_params.data.seed = rand(); + + sh = pl_dispatch_begin(dp); + pl_shader_film_grain(sh, &grain, &grain_params); + REQUIRE(pl_dispatch_finish(dp, &(struct pl_dispatch_params) { + .shader = &sh, + .target = fbo, + })); + } + pl_shader_obj_destroy(&grain); + + // Test custom shaders + struct pl_custom_shader custom = { + .header = + "vec3 invert(vec3 color) \n" + "{ \n" + " return vec3(1.0) - color; \n" + "} \n", + + .body = + "color = vec4(gl_FragCoord.xy, 0.0, 1.0); \n" + "color.rgb = invert(color.rgb) + offset; \n", + + .input = PL_SHADER_SIG_NONE, + .output = PL_SHADER_SIG_COLOR, + + .num_variables = 1, + .variables = &(struct pl_shader_var) { + .var = pl_var_float("offset"), + .data = &(float) { 0.1 }, + }, + }; + + sh = pl_dispatch_begin(dp); + REQUIRE(pl_shader_custom(sh, &custom)); + REQUIRE(pl_dispatch_finish(dp, &(struct pl_dispatch_params) { + .shader = &sh, + .target = fbo, + })); + + // Test dolbyvision + struct pl_color_repr repr = { + .sys = PL_COLOR_SYSTEM_DOLBYVISION, + .dovi = &dovi_meta, + }; + + sh = pl_dispatch_begin(dp); + pl_shader_sample_direct(sh, pl_sample_src( .tex = src )); + pl_shader_decode_color(sh, &repr, NULL); + REQUIRE(pl_dispatch_finish(dp, &(struct pl_dispatch_params) { + .shader = &sh, + .target = fbo, + })); + + // Test deinterlacing + sh = pl_dispatch_begin(dp); + pl_shader_deinterlace(sh, pl_deinterlace_source( .cur = pl_field_pair(src) ), NULL); + REQUIRE(pl_dispatch_finish(dp, pl_dispatch_params( + .shader = &sh, + .target = fbo, + ))); + + // Test error diffusion + if (fbo->params.storable) { + for (int i = 0; i < pl_num_error_diffusion_kernels; i++) { + const struct pl_error_diffusion_kernel *k = pl_error_diffusion_kernels[i]; + printf("testing error diffusion kernel '%s'\n", k->name); + sh = pl_dispatch_begin(dp); + bool ok = pl_shader_error_diffusion(sh, pl_error_diffusion_params( + .input_tex = src, + .output_tex = fbo, + .new_depth = 8, + .kernel = k, + )); + + if (!ok) { + fprintf(stderr, "kernel '%s' exceeds GPU limits, skipping...\n", k->name); + continue; + } + + REQUIRE(pl_dispatch_compute(dp, pl_dispatch_compute_params( + .shader = &sh, + .dispatch_size = {1, 1, 1}, + ))); + } + } + + pl_dispatch_destroy(&dp); + pl_tex_destroy(gpu, &src); + pl_tex_destroy(gpu, &fbo); +} + +static void pl_scaler_tests(pl_gpu gpu) +{ + pl_fmt src_fmt = pl_find_fmt(gpu, PL_FMT_FLOAT, 1, 16, 32, PL_FMT_CAP_LINEAR); + pl_fmt fbo_fmt = pl_find_fmt(gpu, PL_FMT_FLOAT, 1, 16, 32, PL_FMT_CAP_RENDERABLE); + if (!src_fmt || !fbo_fmt) + return; + + float *fbo_data = NULL; + pl_shader_obj lut = NULL; + + static float data_5x5[5][5] = { + { 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0 }, + { 0, 0, 1, 0, 0 }, + { 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0 }, + }; + + pl_tex dot5x5 = pl_tex_create(gpu, &(struct pl_tex_params) { + .w = 5, + .h = 5, + .format = src_fmt, + .sampleable = true, + .initial_data = &data_5x5[0][0], + }); + + struct pl_tex_params fbo_params = { + .w = 100, + .h = 100, + .format = fbo_fmt, + .renderable = true, + .storable = fbo_fmt->caps & PL_FMT_CAP_STORABLE, + .host_readable = fbo_fmt->caps & PL_FMT_CAP_HOST_READABLE, + }; + + pl_tex fbo = pl_tex_create(gpu, &fbo_params); + pl_dispatch dp = pl_dispatch_create(gpu->log, gpu); + if (!dot5x5 || !fbo || !dp) + goto error; + + pl_shader sh = pl_dispatch_begin(dp); + 
REQUIRE(pl_shader_sample_polar(sh, + pl_sample_src( + .tex = dot5x5, + .new_w = fbo->params.w, + .new_h = fbo->params.h, + ), + pl_sample_filter_params( + .filter = pl_filter_ewa_lanczos, + .lut = &lut, + .no_compute = !fbo->params.storable, + ) + )); + REQUIRE(pl_dispatch_finish(dp, &(struct pl_dispatch_params) { + .shader = &sh, + .target = fbo, + })); + + if (fbo->params.host_readable) { + fbo_data = malloc(fbo->params.w * fbo->params.h * sizeof(float)); + REQUIRE(pl_tex_download(gpu, &(struct pl_tex_transfer_params) { + .tex = fbo, + .ptr = fbo_data, + })); + +#ifdef PRINT_OUTPUT + int max = 255; + printf("P2\n%d %d\n%d\n", fbo->params.w, fbo->params.h, max); + for (int y = 0; y < fbo->params.h; y++) { + for (int x = 0; x < fbo->params.w; x++) { + float v = fbo_data[y * fbo->params.h + x]; + printf("%d ", (int) round(fmin(fmax(v, 0.0), 1.0) * max)); + } + printf("\n"); + } +#endif + } + +error: + free(fbo_data); + pl_shader_obj_destroy(&lut); + pl_dispatch_destroy(&dp); + pl_tex_destroy(gpu, &dot5x5); + pl_tex_destroy(gpu, &fbo); +} + +static const char *user_shader_tests[] = { + // Test hooking, saving and loading + "// Example of a comment at the beginning \n" + " \n" + "//!HOOK NATIVE \n" + "//!DESC upscale image \n" + "//!BIND HOOKED \n" + "//!WIDTH HOOKED.w 10 * \n" + "//!HEIGHT HOOKED.h 10 * \n" + "//!SAVE NATIVEBIG \n" + "//!WHEN NATIVE.w 500 < \n" + " \n" + "vec4 hook() \n" + "{ \n" + " return HOOKED_texOff(0); \n" + "} \n" + " \n" + "//!HOOK MAIN \n" + "//!DESC downscale bigger image \n" + "//!WHEN NATIVE.w 500 < \n" + "//!BIND NATIVEBIG \n" + " \n" + "vec4 hook() \n" + "{ \n" + " return NATIVEBIG_texOff(0); \n" + "} \n", + + // Test use of textures + "//!HOOK MAIN \n" + "//!DESC turn everything into colorful pixels \n" + "//!BIND HOOKED \n" + "//!BIND DISCO \n" + "//!COMPONENTS 3 \n" + " \n" + "vec4 hook() \n" + "{ \n" + " return vec4(DISCO_tex(HOOKED_pos * 10.0).rgb, 1); \n" + "} \n" + " \n" + "//!TEXTURE DISCO \n" + "//!SIZE 3 3 \n" + "//!FORMAT rgba8 \n" + "//!FILTER NEAREST \n" + "//!BORDER REPEAT \n" + "ff0000ff00ff00ff0000ffff00ffffffff00ffffffff00ff4c4c4cff999999ffffffffff\n" + + // Test custom parameters + "//!PARAM test \n" + "//!DESC test parameter \n" + "//!TYPE DYNAMIC float \n" + "//!MINIMUM 0.0 \n" + "//!MAXIMUM 100.0 \n" + "1.0 \n" + " \n" + "//!PARAM testconst \n" + "//!TYPE CONSTANT uint \n" + "//!MAXIMUM 16 \n" + "3 \n" + " \n" + "//!PARAM testdefine \n" + "//!TYPE DEFINE \n" + "100 \n" + " \n" + "//!PARAM testenum \n" + "//!TYPE ENUM DEFINE \n" + "FOO \n" + "BAR \n" + " \n" + "//!HOOK MAIN \n" + "//!WHEN testconst 30 > \n" + "#error should not be run \n" + " \n" + "//!HOOK MAIN \n" + "//!WHEN testenum FOO = \n" + "#if testenum == BAR \n" + " #error bad \n" + "#endif \n" + "vec4 hook() { return vec4(0.0); } \n" +}; + +static const char *compute_shader_tests[] = { + // Test use of storage/buffer resources + "//!HOOK MAIN \n" + "//!DESC attach some storage objects \n" + "//!BIND tex_storage \n" + "//!BIND buf_uniform \n" + "//!BIND buf_storage \n" + "//!COMPONENTS 4 \n" + " \n" + "vec4 hook() \n" + "{ \n" + " return vec4(foo, bar, bat); \n" + "} \n" + " \n" + "//!TEXTURE tex_storage \n" + "//!SIZE 100 100 \n" + "//!FORMAT r32f \n" + "//!STORAGE \n" + " \n" + "//!BUFFER buf_uniform \n" + "//!VAR float foo \n" + "//!VAR float bar \n" + "0000000000000000 \n" + " \n" + "//!BUFFER buf_storage \n" + "//!VAR vec2 bat \n" + "//!VAR int big[32]; \n" + "//!STORAGE \n", + +}; + +static const char *test_luts[] = { + + "TITLE \"1D identity\" \n" + "LUT_1D_SIZE 2 \n" + "0.0 
0.0 0.0 \n" + "1.0 1.0 1.0 \n", + + "TITLE \"3D identity\" \n" + "LUT_3D_SIZE 2 \n" + "0.0 0.0 0.0 \n" + "1.0 0.0 0.0 \n" + "0.0 1.0 0.0 \n" + "1.0 1.0 0.0 \n" + "0.0 0.0 1.0 \n" + "1.0 0.0 1.0 \n" + "0.0 1.0 1.0 \n" + "1.0 1.0 1.0 \n" + +}; + +static bool frame_passthrough(pl_gpu gpu, pl_tex *tex, + const struct pl_source_frame *src, struct pl_frame *out_frame) +{ + const struct pl_frame *frame = src->frame_data; + *out_frame = *frame; + return true; +} + +static enum pl_queue_status get_frame_ptr(struct pl_source_frame *out_frame, + const struct pl_queue_params *qparams) +{ + const struct pl_source_frame **pframe = qparams->priv; + if (!(*pframe)->frame_data) + return PL_QUEUE_EOF; + + *out_frame = *(*pframe)++; + return PL_QUEUE_OK; +} + +static void render_info_cb(void *priv, const struct pl_render_info *info) +{ + printf("{%d} Executed shader: %s\n", info->index, + info->pass->shader->description); +} + +static void pl_render_tests(pl_gpu gpu) +{ + pl_tex img_tex = NULL, fbo = NULL; + pl_renderer rr = NULL; + + enum { width = 50, height = 50 }; + static float data[width][height]; + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) + data[y][x] = RANDOM; + } + + struct pl_plane img_plane = {0}; + struct pl_plane_data plane_data = { + .type = PL_FMT_FLOAT, + .width = width, + .height = height, + .component_size = { 8 * sizeof(float) }, + .component_map = { 0 }, + .pixel_stride = sizeof(float), + .pixels = data, + }; + + if (!pl_recreate_plane(gpu, NULL, &fbo, &plane_data)) + return; + + if (!pl_upload_plane(gpu, &img_plane, &img_tex, &plane_data)) + goto error; + + rr = pl_renderer_create(gpu->log, gpu); + pl_tex_clear_ex(gpu, fbo, (union pl_clear_color){0}); + + struct pl_frame image = { + .num_planes = 1, + .planes = { img_plane }, + .repr = { + .sys = PL_COLOR_SYSTEM_BT_709, + .levels = PL_COLOR_LEVELS_FULL, + }, + .color = pl_color_space_srgb, + }; + + struct pl_frame target = { + .num_planes = 1, + .planes = {{ + .texture = fbo, + .components = 3, + .component_mapping = {0, 1, 2}, + }}, + .repr = { + .sys = PL_COLOR_SYSTEM_RGB, + .levels = PL_COLOR_LEVELS_FULL, + .bits.color_depth = 32, + }, + .color = pl_color_space_srgb, + }; + + REQUIRE(pl_render_image(rr, &image, &target, NULL)); + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); + + // TODO: embed a reference texture and ensure it matches + + // Test a bunch of different params +#define TEST(SNAME, STYPE, DEFAULT, FIELD, LIMIT) \ + do { \ + for (int i = 0; i <= LIMIT; i++) { \ + printf("testing `" #STYPE "." 
#FIELD " = %d`\n", i); \ + struct pl_render_params params = pl_render_default_params; \ + params.force_dither = true; \ + struct STYPE tmp = DEFAULT; \ + tmp.FIELD = i; \ + params.SNAME = &tmp; \ + REQUIRE(pl_render_image(rr, &image, &target, &params)); \ + pl_gpu_flush(gpu); \ + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); \ + } \ + } while (0) + +#define TEST_PARAMS(NAME, FIELD, LIMIT) \ + TEST(NAME##_params, pl_##NAME##_params, pl_##NAME##_default_params, FIELD, LIMIT) + + image.crop.x1 = width / 2.0; + image.crop.y1 = height / 2.0; + for (int i = 0; i < pl_num_scale_filters; i++) { + struct pl_render_params params = pl_render_default_params; + params.upscaler = pl_scale_filters[i].filter; + printf("testing `params.upscaler = /* %s */`\n", pl_scale_filters[i].name); + REQUIRE(pl_render_image(rr, &image, &target, &params)); + pl_gpu_flush(gpu); + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); + } + image.crop.x1 = image.crop.y1 = 0; + + target.crop.x1 = width / 2.0; + target.crop.y1 = height / 2.0; + for (int i = 0; i < pl_num_scale_filters; i++) { + struct pl_render_params params = pl_render_default_params; + params.downscaler = pl_scale_filters[i].filter; + printf("testing `params.downscaler = /* %s */`\n", pl_scale_filters[i].name); + REQUIRE(pl_render_image(rr, &image, &target, &params)); + pl_gpu_flush(gpu); + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); + } + target.crop.x1 = target.crop.y1 = 0; + + TEST_PARAMS(deband, iterations, 3); + TEST_PARAMS(sigmoid, center, 1); + TEST_PARAMS(color_map, intent, PL_INTENT_ABSOLUTE_COLORIMETRIC); + TEST_PARAMS(dither, method, PL_DITHER_WHITE_NOISE); + TEST_PARAMS(dither, temporal, true); + TEST_PARAMS(distort, alpha_mode, PL_ALPHA_INDEPENDENT); + TEST_PARAMS(distort, constrain, true); + TEST_PARAMS(distort, bicubic, true); + TEST(cone_params, pl_cone_params, pl_vision_deuteranomaly, strength, 0); + + // Test gamma-correct dithering + target.repr.bits.color_depth = 2; + TEST_PARAMS(dither, transfer, PL_COLOR_TRC_GAMMA22); + target.repr.bits.color_depth = 32; + + // Test HDR tone mapping + image.color = pl_color_space_hdr10; + TEST_PARAMS(color_map, visualize_lut, true); + if (gpu->limits.max_ssbo_size) + TEST_PARAMS(peak_detect, allow_delayed, true); + + // Test inverse tone-mapping and pure BPC + image.color.hdr.max_luma = 1000; + target.color.hdr.max_luma = 4000; + target.color.hdr.min_luma = 0.02; + TEST_PARAMS(color_map, inverse_tone_mapping, true); + + image.color = pl_color_space_srgb; + target.color = pl_color_space_srgb; + + // Test some misc stuff + struct pl_render_params params = pl_render_default_params; + params.color_adjustment = &(struct pl_color_adjustment) { + .brightness = 0.1, + .contrast = 0.9, + .saturation = 1.5, + .gamma = 0.8, + .temperature = 0.3, + }; + REQUIRE(pl_render_image(rr, &image, &target, &params)); + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); + params = pl_render_default_params; + + struct pl_frame inferred_image = image, inferred_target = target; + pl_frames_infer(rr, &inferred_image, &inferred_target); + REQUIRE(pl_render_image(rr, &inferred_image, &inferred_target, &params)); + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); + + // Test background blending and alpha transparency + params.blend_against_tiles = true; + params.corner_rounding = 0.25f; + REQUIRE(pl_render_image(rr, &image, &target, &params)); + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); + params = pl_render_default_params; + + // Test film
grain synthesis + image.film_grain.type = PL_FILM_GRAIN_AV1; + image.film_grain.params.av1 = av1_grain_data; + REQUIRE(pl_render_image(rr, &image, &target, &params)); + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); + + image.film_grain.type = PL_FILM_GRAIN_H274; + image.film_grain.params.h274 = h274_grain_data; + REQUIRE(pl_render_image(rr, &image, &target, &params)); + // H.274 film grain synthesis requires compute shaders + if (gpu->glsl.compute) { + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); + } else { + const struct pl_render_errors rr_err = pl_renderer_get_errors(rr); + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_FILM_GRAIN); + pl_renderer_reset_errors(rr, &rr_err); + } + image.film_grain = (struct pl_film_grain_data) {0}; + + // Test mpv-style custom shaders + for (int i = 0; i < PL_ARRAY_SIZE(user_shader_tests); i++) { + printf("testing user shader:\n\n%s\n", user_shader_tests[i]); + const struct pl_hook *hook; + hook = pl_mpv_user_shader_parse(gpu, user_shader_tests[i], + strlen(user_shader_tests[i])); + REQUIRE(hook); + + params.hooks = &hook; + params.num_hooks = 1; + REQUIRE(pl_render_image(rr, &image, &target, &params)); + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); + + pl_mpv_user_shader_destroy(&hook); + } + + if (gpu->glsl.compute && gpu->limits.max_ssbo_size) { + for (int i = 0; i < PL_ARRAY_SIZE(compute_shader_tests); i++) { + printf("testing user shader:\n\n%s\n", compute_shader_tests[i]); + const struct pl_hook *hook; + hook = pl_mpv_user_shader_parse(gpu, compute_shader_tests[i], + strlen(compute_shader_tests[i])); + REQUIRE(hook); + + params.hooks = &hook; + params.num_hooks = 1; + REQUIRE(pl_render_image(rr, &image, &target, &params)); + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); + + pl_mpv_user_shader_destroy(&hook); + } + } + params = pl_render_default_params; + + // Test custom LUTs + for (int i = 0; i < PL_ARRAY_SIZE(test_luts); i++) { + printf("testing custom lut %d\n", i); + struct pl_custom_lut *lut; + lut = pl_lut_parse_cube(gpu->log, test_luts[i], strlen(test_luts[i])); + REQUIRE(lut); + + bool has_3dlut = gpu->limits.max_tex_3d_dim && gpu->glsl.version > 100; + if (lut->size[2] && !has_3dlut) { + pl_lut_free(&lut); + continue; + } + + // Test all three at the same time to reduce the number of tests + image.lut = target.lut = params.lut = lut; + + for (enum pl_lut_type t = PL_LUT_UNKNOWN; t <= PL_LUT_CONVERSION; t++) { + printf("testing LUT method %d\n", t); + image.lut_type = target.lut_type = params.lut_type = t; + REQUIRE(pl_render_image(rr, &image, &target, &params)); + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); + } + + image.lut = target.lut = params.lut = NULL; + pl_lut_free(&lut); + } + +#ifdef PL_HAVE_LCMS + + // It doesn't fit without use of 3D textures on GLES2 + if (gpu->glsl.version > 100) { + // Test ICC profiles + image.profile = TEST_PROFILE(sRGB_v2_nano_icc); + REQUIRE(pl_render_image(rr, &image, &target, &params)); + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); + image.profile = (struct pl_icc_profile) {0}; + + target.profile = TEST_PROFILE(sRGB_v2_nano_icc); + REQUIRE(pl_render_image(rr, &image, &target, &params)); + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); + target.profile = (struct pl_icc_profile) {0}; + + image.profile = TEST_PROFILE(sRGB_v2_nano_icc); + target.profile = image.profile; + REQUIRE(pl_render_image(rr, &image, &target, &params)); + REQUIRE(pl_renderer_get_errors(rr).errors == 
PL_RENDER_ERR_NONE); + image.profile = (struct pl_icc_profile) {0}; + target.profile = (struct pl_icc_profile) {0}; + } + +#endif + + // Test overlays + image.num_overlays = 1; + image.overlays = &(struct pl_overlay) { + .tex = img_plane.texture, + .mode = PL_OVERLAY_NORMAL, + .num_parts = 2, + .parts = (struct pl_overlay_part[]) {{ + .src = {0, 0, 2, 2}, + .dst = {30, 100, 40, 200}, + }, { + .src = {2, 2, 5, 5}, + .dst = {1000, -1, 3, 5}, + }}, + }; + REQUIRE(pl_render_image(rr, &image, &target, &params)); + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); + params.disable_fbos = true; + REQUIRE(pl_render_image(rr, &image, &target, &params)); + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); + image.num_overlays = 0; + params = pl_render_default_params; + + target.num_overlays = 1; + target.overlays = &(struct pl_overlay) { + .tex = img_plane.texture, + .mode = PL_OVERLAY_MONOCHROME, + .num_parts = 1, + .parts = &(struct pl_overlay_part) { + .src = {5, 5, 15, 15}, + .dst = {5, 5, 15, 15}, + .color = {1.0, 0.5, 0.0}, + }, + }; + REQUIRE(pl_render_image(rr, &image, &target, &params)); + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); + REQUIRE(pl_render_image(rr, NULL, &target, &params)); + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); + target.num_overlays = 0; + + // Test rotation + for (pl_rotation rot = 0; rot < PL_ROTATION_360; rot += PL_ROTATION_90) { + image.rotation = rot; + REQUIRE(pl_render_image(rr, &image, &target, &params)); + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); + } + + // Attempt frame mixing, using the mixer queue helper + printf("testing frame mixing \n"); + struct pl_render_params mix_params = { + .frame_mixer = &pl_filter_mitchell_clamp, + .info_callback = render_info_cb, + }; + + struct pl_queue_params qparams = { + .radius = pl_frame_mix_radius(&mix_params), + .vsync_duration = 1.0 / 60.0, + }; + + // Test large PTS jumps in frame mix + struct pl_frame_mix mix = (struct pl_frame_mix) { + .num_frames = 2, + .frames = (const struct pl_frame *[]) { &image, &image }, + .signatures = (uint64_t[]) { 0xFFF1, 0xFFF2 }, + .timestamps = (float[]) { -100, 100 }, + .vsync_duration = 1.6, + }; + REQUIRE(pl_render_image_mix(rr, &mix, &target, &mix_params)); + + // Test inferring frame mix + inferred_target = target; + pl_frames_infer_mix(rr, &mix, &inferred_target, &inferred_image); + REQUIRE(pl_render_image_mix(rr, &mix, &target, &mix_params)); + + // Test empty frame mix + mix = (struct pl_frame_mix) {0}; + REQUIRE(pl_render_image_mix(rr, &mix, &target, &mix_params)); + + // Test inferring empty frame mix + inferred_target = target; + pl_frames_infer_mix(rr, &mix, &inferred_target, &inferred_image); + REQUIRE(pl_render_image_mix(rr, &mix, &target, &mix_params)); + + // Test mixer queue +#define NUM_MIX_FRAMES 20 + const float frame_duration = 1.0 / 24.0; + struct pl_source_frame srcframes[NUM_MIX_FRAMES+1]; + srcframes[NUM_MIX_FRAMES] = (struct pl_source_frame) {0}; + for (int i = 0; i < NUM_MIX_FRAMES; i++) { + srcframes[i] = (struct pl_source_frame) { + .pts = i * frame_duration, + .duration = frame_duration, + .map = frame_passthrough, + .frame_data = &image, + }; + } + + pl_queue queue = pl_queue_create(gpu); + enum pl_queue_status ret; + + // Test pre-pushing all frames, with delayed EOF.
+ for (int i = 0; i < NUM_MIX_FRAMES; i++) { + const struct pl_source_frame *src = &srcframes[i]; + if (i > 10) // test pushing in reverse order + src = &srcframes[NUM_MIX_FRAMES + 10 - i]; + if (!pl_queue_push_block(queue, 1, src)) // mini-sleep + pl_queue_push(queue, src); // push it anyway, for testing + } + + while ((ret = pl_queue_update(queue, &mix, &qparams)) != PL_QUEUE_EOF) { + if (ret == PL_QUEUE_MORE) { + REQUIRE_CMP(qparams.pts, >, 0.0f, "f"); + pl_queue_push(queue, NULL); // push delayed EOF + continue; + } + + REQUIRE_CMP(ret, ==, PL_QUEUE_OK, "u"); + REQUIRE(pl_render_image_mix(rr, &mix, &target, &mix_params)); + + // Simulate advancing vsync + qparams.pts += qparams.vsync_duration; + } + + // Test dynamically pulling all frames, with oversample mixer + const struct pl_source_frame *frame_ptr = &srcframes[0]; + mix_params.frame_mixer = &pl_oversample_frame_mixer; + + qparams = (struct pl_queue_params) { + .radius = pl_frame_mix_radius(&mix_params), + .vsync_duration = qparams.vsync_duration, + .get_frame = get_frame_ptr, + .priv = &frame_ptr, + }; + + pl_queue_reset(queue); + while ((ret = pl_queue_update(queue, &mix, &qparams)) != PL_QUEUE_EOF) { + REQUIRE_CMP(ret, ==, PL_QUEUE_OK, "u"); + REQUIRE_CMP(mix.num_frames, <=, 2, "d"); + REQUIRE(pl_render_image_mix(rr, &mix, &target, &mix_params)); + qparams.pts += qparams.vsync_duration; + } + + // Test large PTS jump + pl_queue_reset(queue); + REQUIRE(pl_queue_update(queue, &mix, &qparams) == PL_QUEUE_EOF); + + // Test deinterlacing + pl_queue_reset(queue); + printf("testing deinterlacing \n"); + for (int i = 0; i < NUM_MIX_FRAMES; i++) { + struct pl_source_frame *src = &srcframes[i]; + if (i > 10) + src = &srcframes[NUM_MIX_FRAMES + 10 - i]; + src->first_field = PL_FIELD_EVEN; + pl_queue_push(queue, src); + } + pl_queue_push(queue, NULL); + + qparams.pts = 0; + qparams.get_frame = NULL; + while ((ret = pl_queue_update(queue, &mix, &qparams)) != PL_QUEUE_EOF) { + REQUIRE_CMP(ret, ==, PL_QUEUE_OK, "u"); + REQUIRE(pl_render_image_mix(rr, &mix, &target, &mix_params)); + qparams.pts += qparams.vsync_duration; + } + + pl_queue_destroy(&queue); + +error: + pl_renderer_destroy(&rr); + pl_tex_destroy(gpu, &img_tex); + pl_tex_destroy(gpu, &fbo); +} + +static struct pl_hook_res noop_hook(void *priv, const struct pl_hook_params *params) +{ + return (struct pl_hook_res) {0}; +} + +static void pl_ycbcr_tests(pl_gpu gpu) +{ + struct pl_plane_data data[3]; + for (int i = 0; i < 3; i++) { + const int sub = i > 0 ? 
1 : 0; + const int width = (323 + sub) >> sub; + const int height = (255 + sub) >> sub; + + data[i] = (struct pl_plane_data) { + .type = PL_FMT_UNORM, + .width = width, + .height = height, + .component_size = {16}, + .component_map = {i}, + .pixel_stride = sizeof(uint16_t), + .row_stride = PL_ALIGN2(width * sizeof(uint16_t), + gpu->limits.align_tex_xfer_pitch), + }; + } + + pl_fmt fmt = pl_plane_find_fmt(gpu, NULL, &data[0]); + enum pl_fmt_caps caps = PL_FMT_CAP_RENDERABLE | PL_FMT_CAP_HOST_READABLE; + if (!fmt || (fmt->caps & caps) != caps) + return; + + pl_renderer rr = pl_renderer_create(gpu->log, gpu); + if (!rr) + return; + + pl_tex src_tex[3] = {0}; + pl_tex dst_tex[3] = {0}; + struct pl_frame img = { + .num_planes = 3, + .repr = pl_color_repr_hdtv, + .color = pl_color_space_bt709, + }; + + struct pl_frame target = { + .num_planes = 3, + .repr = pl_color_repr_hdtv, + .color = pl_color_space_bt709, + }; + + uint8_t *src_buffer[3] = {0}; + uint8_t *dst_buffer = NULL; + for (int i = 0; i < 3; i++) { + // Generate some arbitrary data for the buffer + src_buffer[i] = malloc(data[i].height * data[i].row_stride); + if (!src_buffer[i]) + goto error; + + data[i].pixels = src_buffer[i]; + for (int y = 0; y < data[i].height; y++) { + for (int x = 0; x < data[i].width; x++) { + size_t off = y * data[i].row_stride + x * data[i].pixel_stride; + uint16_t *pixel = (uint16_t *) &src_buffer[i][off]; + int gx = 200 + 100 * i, gy = 300 + 150 * i; + *pixel = (gx * x) ^ (gy * y); // whatever + } + } + + REQUIRE(pl_upload_plane(gpu, &img.planes[i], &src_tex[i], &data[i])); + } + + // This co-sites chroma pixels with pixels in the RGB image, meaning we + // get an exact round-trip when sampling both ways. This makes it useful + // as a test case, even though it's not common in the real world. 
+ pl_frame_set_chroma_location(&img, PL_CHROMA_TOP_LEFT); + + for (int i = 0; i < 3; i++) { + dst_tex[i] = pl_tex_create(gpu, &(struct pl_tex_params) { + .format = fmt, + .w = data[i].width, + .h = data[i].height, + .renderable = true, + .host_readable = true, + .storable = fmt->caps & PL_FMT_CAP_STORABLE, + .blit_dst = fmt->caps & PL_FMT_CAP_BLITTABLE, + }); + + if (!dst_tex[i]) + goto error; + + target.planes[i] = img.planes[i]; + target.planes[i].texture = dst_tex[i]; + } + + REQUIRE(pl_render_image(rr, &img, &target, &(struct pl_render_params) { + .num_hooks = 1, + .hooks = &(const struct pl_hook *){&(struct pl_hook) { + // Forces chroma merging, to test the chroma merging code + .stages = PL_HOOK_CHROMA_INPUT, + .hook = noop_hook, + }}, + })); + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); + + size_t buf_size = data[0].height * data[0].row_stride; + dst_buffer = malloc(buf_size); + if (!dst_buffer) + goto error; + + for (int i = 0; i < 3; i++) { + memset(dst_buffer, 0xAA, buf_size); + REQUIRE(pl_tex_download(gpu, &(struct pl_tex_transfer_params) { + .tex = dst_tex[i], + .ptr = dst_buffer, + .row_pitch = data[i].row_stride, + })); + + for (int y = 0; y < data[i].height; y++) { + for (int x = 0; x < data[i].width; x++) { + size_t off = y * data[i].row_stride + x * data[i].pixel_stride; + uint16_t *src_pixel = (uint16_t *) &src_buffer[i][off]; + uint16_t *dst_pixel = (uint16_t *) &dst_buffer[off]; + int diff = abs((int) *src_pixel - (int) *dst_pixel); + REQUIRE_CMP(diff, <=, 50, "d"); // a little under 0.1% + } + } + } + +error: + pl_renderer_destroy(&rr); + free(dst_buffer); + for (int i = 0; i < 3; i++) { + free(src_buffer[i]); + pl_tex_destroy(gpu, &src_tex[i]); + pl_tex_destroy(gpu, &dst_tex[i]); + } +} + +static void pl_test_export_import(pl_gpu gpu, + enum pl_handle_type handle_type) +{ + // Test texture roundtrip + + if (!(gpu->export_caps.tex & handle_type) || + !(gpu->import_caps.tex & handle_type)) + goto skip_tex; + + pl_fmt fmt = pl_find_fmt(gpu, PL_FMT_UNORM, 4, 0, 0, PL_FMT_CAP_BLITTABLE); + if (!fmt) + goto skip_tex; + + printf("testing texture import/export with fmt %s\n", fmt->name); + + pl_tex export = pl_tex_create(gpu, &(struct pl_tex_params) { + .w = 32, + .h = 32, + .format = fmt, + .export_handle = handle_type, + }); + REQUIRE(export); + REQUIRE_HANDLE(export->shared_mem, handle_type); + + pl_tex import = pl_tex_create(gpu, &(struct pl_tex_params) { + .w = export->params.w, + .h = export->params.h, + .format = fmt, + .import_handle = handle_type, + .shared_mem = export->shared_mem, + }); + REQUIRE(import); + + pl_tex_destroy(gpu, &import); + pl_tex_destroy(gpu, &export); + +skip_tex: ; + + // Test buffer roundtrip + + if (!(gpu->export_caps.buf & handle_type) || + !(gpu->import_caps.buf & handle_type)) + return; + + printf("testing buffer import/export\n"); + + pl_buf exp_buf = pl_buf_create(gpu, &(struct pl_buf_params) { + .size = 32, + .export_handle = handle_type, + }); + REQUIRE(exp_buf); + REQUIRE_HANDLE(exp_buf->shared_mem, handle_type); + + pl_buf imp_buf = pl_buf_create(gpu, &(struct pl_buf_params) { + .size = 32, + .import_handle = handle_type, + .shared_mem = exp_buf->shared_mem, + }); + REQUIRE(imp_buf); + + pl_buf_destroy(gpu, &imp_buf); + pl_buf_destroy(gpu, &exp_buf); +} + +static void pl_test_host_ptr(pl_gpu gpu) +{ + if (!(gpu->import_caps.buf & PL_HANDLE_HOST_PTR)) + return; + +#ifdef __unix__ + + printf("testing host ptr\n"); + REQUIRE(gpu->limits.max_mapped_size); + + const size_t size = 2 << 20; + const size_t offset = 2 << 
10; + const size_t slice = 2 << 16; + + uint8_t *data = aligned_alloc(0x1000, size); + for (int i = 0; i < size; i++) + data[i] = (uint8_t) i; + + pl_buf buf = pl_buf_create(gpu, &(struct pl_buf_params) { + .size = slice, + .import_handle = PL_HANDLE_HOST_PTR, + .shared_mem = { + .handle.ptr = data, + .size = size, + .offset = offset, + }, + .host_mapped = true, + }); + + REQUIRE(buf); + REQUIRE_MEMEQ(data + offset, buf->data, slice); + + pl_buf_destroy(gpu, &buf); + free(data); + +#endif // unix +} + +static void gpu_shader_tests(pl_gpu gpu) +{ + pl_buffer_tests(gpu); + pl_texture_tests(gpu); + pl_planar_tests(gpu); + pl_shader_tests(gpu); + pl_scaler_tests(gpu); + pl_render_tests(gpu); + pl_ycbcr_tests(gpu); + + REQUIRE(!pl_gpu_is_failed(gpu)); +} + +static void gpu_interop_tests(pl_gpu gpu) +{ + pl_test_export_import(gpu, PL_HANDLE_DMA_BUF); + pl_test_host_ptr(gpu); + + REQUIRE(!pl_gpu_is_failed(gpu)); +} diff --git a/src/tests/icc.c b/src/tests/icc.c new file mode 100644 index 0000000..188940b --- /dev/null +++ b/src/tests/icc.c @@ -0,0 +1,106 @@ +#include "tests.h" + +#include <libplacebo/shaders/icc.h> + +static const uint8_t DisplayP3_v2_micro_icc[] = { + 0x00, 0x00, 0x01, 0xc8, 0x6c, 0x63, 0x6d, 0x73, 0x02, 0x10, 0x00, 0x00, + 0x6d, 0x6e, 0x74, 0x72, 0x52, 0x47, 0x42, 0x20, 0x58, 0x59, 0x5a, 0x20, + 0x07, 0xe2, 0x00, 0x03, 0x00, 0x14, 0x00, 0x09, 0x00, 0x0e, 0x00, 0x1d, + 0x61, 0x63, 0x73, 0x70, 0x4d, 0x53, 0x46, 0x54, 0x00, 0x00, 0x00, 0x00, + 0x73, 0x61, 0x77, 0x73, 0x63, 0x74, 0x72, 0x6c, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf6, 0xd6, + 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0xd3, 0x2d, 0x68, 0x61, 0x6e, 0x64, + 0xb4, 0xaa, 0xdd, 0x1f, 0x13, 0xc8, 0x03, 0x3c, 0xf5, 0x51, 0x14, 0x45, + 0x28, 0x7a, 0x98, 0xe2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x09, + 0x64, 0x65, 0x73, 0x63, 0x00, 0x00, 0x00, 0xf0, 0x00, 0x00, 0x00, 0x5e, + 0x63, 0x70, 0x72, 0x74, 0x00, 0x00, 0x01, 0x0c, 0x00, 0x00, 0x00, 0x0c, + 0x77, 0x74, 0x70, 0x74, 0x00, 0x00, 0x01, 0x18, 0x00, 0x00, 0x00, 0x14, + 0x72, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x01, 0x2c, 0x00, 0x00, 0x00, 0x14, + 0x67, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x00, 0x14, + 0x62, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x01, 0x54, 0x00, 0x00, 0x00, 0x14, + 0x72, 0x54, 0x52, 0x43, 0x00, 0x00, 0x01, 0x68, 0x00, 0x00, 0x00, 0x60, + 0x67, 0x54, 0x52, 0x43, 0x00, 0x00, 0x01, 0x68, 0x00, 0x00, 0x00, 0x60, + 0x62, 0x54, 0x52, 0x43, 0x00, 0x00, 0x01, 0x68, 0x00, 0x00, 0x00, 0x60, + 0x64, 0x65, 0x73, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, + 0x75, 0x50, 0x33, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x74, 0x65, 0x78, 0x74, 0x00, 0x00, 0x00, 0x00, + 0x43, 0x43, 0x30, 0x00, 0x58, 0x59, 0x5a, 0x20, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0xf3, 0x51, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x16, 0xcc, + 0x58, 0x59, 0x5a, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x83, 0xdf, + 0x00, 0x00, 0x3d, 0xbf, 0xff, 0xff, 0xff, 0xbb, 0x58, 0x59, 0x5a, 0x20, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x4a, 0xbf, 0x00, 0x00, 0xb1, 0x37, + 0x00, 0x00, 0x0a, 0xb9, 0x58, 0x59, 0x5a, 0x20, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x28, 0x38, 0x00, 0x00, 0x11, 0x0a, 0x00, 0x00, 0xc8, 0xb9, + 0x63, 0x75, 0x72, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2a, + 0x00, 0x00, 0x00, 0x7c, 0x00, 0xf8, 0x01, 0x9c, 0x02, 0x75, 0x03, 0x83, + 0x04, 
0xc9, 0x06, 0x4e, 0x08, 0x12, 0x0a, 0x18, 0x0c, 0x62, 0x0e, 0xf4, + 0x11, 0xcf, 0x14, 0xf6, 0x18, 0x6a, 0x1c, 0x2e, 0x20, 0x43, 0x24, 0xac, + 0x29, 0x6a, 0x2e, 0x7e, 0x33, 0xeb, 0x39, 0xb3, 0x3f, 0xd6, 0x46, 0x57, + 0x4d, 0x36, 0x54, 0x76, 0x5c, 0x17, 0x64, 0x1d, 0x6c, 0x86, 0x75, 0x56, + 0x7e, 0x8d, 0x88, 0x2c, 0x92, 0x36, 0x9c, 0xab, 0xa7, 0x8c, 0xb2, 0xdb, + 0xbe, 0x99, 0xca, 0xc7, 0xd7, 0x65, 0xe4, 0x77, 0xf1, 0xf9, 0xff, 0xff +}; + +static const uint8_t Rec2020_v2_micro_icc[] = { + 0x00, 0x00, 0x01, 0xcc, 0x6c, 0x63, 0x6d, 0x73, 0x02, 0x10, 0x00, 0x00, + 0x6d, 0x6e, 0x74, 0x72, 0x52, 0x47, 0x42, 0x20, 0x58, 0x59, 0x5a, 0x20, + 0x07, 0xe2, 0x00, 0x03, 0x00, 0x14, 0x00, 0x09, 0x00, 0x0e, 0x00, 0x1d, + 0x61, 0x63, 0x73, 0x70, 0x4d, 0x53, 0x46, 0x54, 0x00, 0x00, 0x00, 0x00, + 0x73, 0x61, 0x77, 0x73, 0x63, 0x74, 0x72, 0x6c, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf6, 0xd6, + 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0xd3, 0x2d, 0x68, 0x61, 0x6e, 0x64, + 0x17, 0xcb, 0x44, 0xd1, 0x0d, 0xca, 0xe1, 0xc9, 0x03, 0x3e, 0x20, 0x85, + 0x4a, 0x67, 0x4e, 0xa9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x09, + 0x64, 0x65, 0x73, 0x63, 0x00, 0x00, 0x00, 0xf0, 0x00, 0x00, 0x00, 0x5f, + 0x63, 0x70, 0x72, 0x74, 0x00, 0x00, 0x01, 0x0c, 0x00, 0x00, 0x00, 0x0c, + 0x77, 0x74, 0x70, 0x74, 0x00, 0x00, 0x01, 0x18, 0x00, 0x00, 0x00, 0x14, + 0x72, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x01, 0x2c, 0x00, 0x00, 0x00, 0x14, + 0x67, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x00, 0x14, + 0x62, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x01, 0x54, 0x00, 0x00, 0x00, 0x14, + 0x72, 0x54, 0x52, 0x43, 0x00, 0x00, 0x01, 0x68, 0x00, 0x00, 0x00, 0x64, + 0x67, 0x54, 0x52, 0x43, 0x00, 0x00, 0x01, 0x68, 0x00, 0x00, 0x00, 0x64, + 0x62, 0x54, 0x52, 0x43, 0x00, 0x00, 0x01, 0x68, 0x00, 0x00, 0x00, 0x64, + 0x64, 0x65, 0x73, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, + 0x75, 0x32, 0x30, 0x32, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x74, 0x65, 0x78, 0x74, 0x00, 0x00, 0x00, 0x00, + 0x43, 0x43, 0x30, 0x00, 0x58, 0x59, 0x5a, 0x20, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0xf3, 0x51, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x16, 0xcc, + 0x58, 0x59, 0x5a, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xac, 0x69, + 0x00, 0x00, 0x47, 0x70, 0xff, 0xff, 0xff, 0x81, 0x58, 0x59, 0x5a, 0x20, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2a, 0x6a, 0x00, 0x00, 0xac, 0xe3, + 0x00, 0x00, 0x07, 0xad, 0x58, 0x59, 0x5a, 0x20, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x20, 0x03, 0x00, 0x00, 0x0b, 0xad, 0x00, 0x00, 0xcb, 0xff, + 0x63, 0x75, 0x72, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2c, + 0x00, 0x00, 0x01, 0x53, 0x02, 0xa5, 0x03, 0xf8, 0x05, 0x4e, 0x06, 0xd6, + 0x08, 0x98, 0x0a, 0x8f, 0x0c, 0xc3, 0x0f, 0x31, 0x11, 0xdc, 0x14, 0xc3, + 0x17, 0xe8, 0x1b, 0x4c, 0x1e, 0xf0, 0x22, 0xd5, 0x26, 0xfa, 0x2b, 0x62, + 0x30, 0x0c, 0x34, 0xfa, 0x3a, 0x2b, 0x3f, 0xa2, 0x45, 0x5d, 0x4b, 0x5f, + 0x51, 0xa7, 0x58, 0x37, 0x5f, 0x0d, 0x66, 0x2c, 0x6d, 0x94, 0x75, 0x45, + 0x7d, 0x3f, 0x85, 0x84, 0x8e, 0x13, 0x96, 0xee, 0xa0, 0x13, 0xa9, 0x86, + 0xb3, 0x44, 0xbd, 0x4f, 0xc7, 0xa8, 0xd2, 0x4e, 0xdd, 0x42, 0xe8, 0x86, + 0xf4, 0x16, 0xff, 0xff +}; + +int main() +{ + pl_log log = pl_test_logger(); + pl_icc_object icc; + + icc = pl_icc_open(log, &TEST_PROFILE(sRGB_v2_nano_icc), NULL); + REQUIRE_CMP(icc->csp.primaries, ==, PL_COLOR_PRIM_BT_709, "u"); + 
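+    // (sRGB shares the BT.709 primaries, so this profile is expected to map
+    // to PL_COLOR_PRIM_BT_709)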
pl_icc_close(&icc); + + icc = pl_icc_open(log, &TEST_PROFILE(DisplayP3_v2_micro_icc), NULL); + REQUIRE_CMP(icc->csp.primaries, ==, PL_COLOR_PRIM_DISPLAY_P3, "u"); + pl_icc_close(&icc); + + icc = pl_icc_open(log, &TEST_PROFILE(Rec2020_v2_micro_icc), NULL); + REQUIRE_CMP(icc->csp.primaries, ==, PL_COLOR_PRIM_BT_2020, "u"); + pl_icc_close(&icc); + + pl_log_destroy(&log); +} diff --git a/src/tests/include/include_tmpl.c b/src/tests/include/include_tmpl.c new file mode 100644 index 0000000..dd1000e --- /dev/null +++ b/src/tests/include/include_tmpl.c @@ -0,0 +1 @@ +#include <libplacebo/@header@> diff --git a/src/tests/include/include_tmpl.cpp b/src/tests/include/include_tmpl.cpp new file mode 100644 index 0000000..2b6334c --- /dev/null +++ b/src/tests/include/include_tmpl.cpp @@ -0,0 +1,3 @@ +#define PL_LIBAV_IMPLEMENTATION 0 +#define PL_DAV1D_IMPLEMENTATION 0 +#include <libplacebo/@header@> diff --git a/src/tests/include/meson.build b/src/tests/include/meson.build new file mode 100644 index 0000000..25dfaee --- /dev/null +++ b/src/tests/include/meson.build @@ -0,0 +1,35 @@ +include_tmpl_langs = ['c', 'cpp'] + +# Ensure all headers compile + +test_include_sources = [] +foreach h : headers + + if (h.contains('internal') or + h.contains('dav1d') and not dav1d.found() or + h.contains('libav') and not libav_found or + h.contains('d3d11') and not d3d11_header) + continue + endif + + foreach lang : include_tmpl_langs + + test_include_sources += configure_file( + input: 'include_tmpl.' + lang, + output: 'include_@0@.@1@'.format(h.underscorify(), lang), + configuration: { + 'header': h + }, + ) + + endforeach + +endforeach + +static_library('test_include', test_include_sources, + dependencies: [tdep_static, lavu, lavc, lavf], + include_directories: [inc, vulkan_headers_inc], + implicit_include_directories: false, + c_args: ['-Wall', '-Wextra', '-Wpedantic'], + cpp_args: ['-Wall', '-Wextra', '-Wpedantic'], +) diff --git a/src/tests/libav.c b/src/tests/libav.c new file mode 100644 index 0000000..7c91e85 --- /dev/null +++ b/src/tests/libav.c @@ -0,0 +1,393 @@ +#include "tests.h" +#include "libplacebo/utils/libav.h" + +int main() +{ + struct pl_plane_data data[4] = {0}; + struct pl_bit_encoding bits; + + // Make sure we don't crash on any av pixfmt + const AVPixFmtDescriptor *desc = NULL; + while ((desc = av_pix_fmt_desc_next(desc))) + pl_plane_data_from_pixfmt(data, &bits, av_pix_fmt_desc_get_id(desc)); + +#define TEST(pixfmt, reference) \ + do { \ + int planes = pl_plane_data_from_pixfmt(data, &bits, pixfmt); \ + REQUIRE_CMP(planes, ==, sizeof(reference) / sizeof(*reference), "d"); \ + REQUIRE_MEMEQ(data, reference, sizeof(reference)); \ + } while (0) + + // Planar and semiplanar formats + static const struct pl_plane_data yuvp8[] = { + { + .type = PL_FMT_UNORM, + .component_size = {8}, + .component_map = {0}, + .pixel_stride = 1, + }, { + .type = PL_FMT_UNORM, + .component_size = {8}, + .component_map = {1}, + .pixel_stride = 1, + }, { + .type = PL_FMT_UNORM, + .component_size = {8}, + .component_map = {2}, + .pixel_stride = 1, + } + }; + + TEST(AV_PIX_FMT_YUV420P, yuvp8); + TEST(AV_PIX_FMT_YUV422P, yuvp8); + TEST(AV_PIX_FMT_YUV444P, yuvp8); + TEST(AV_PIX_FMT_YUV410P, yuvp8); + TEST(AV_PIX_FMT_YUV411P, yuvp8); + TEST(AV_PIX_FMT_YUV440P, yuvp8); + + static const struct pl_plane_data yuvap8[] = { + { + .type = PL_FMT_UNORM, + .component_size = {8}, + .component_map = {0}, + .pixel_stride = 1, + }, { + .type = PL_FMT_UNORM, + .component_size = {8}, + .component_map = {1}, + .pixel_stride = 1, + }, { + 
.type = PL_FMT_UNORM, + .component_size = {8}, + .component_map = {2}, + .pixel_stride = 1, + }, { + .type = PL_FMT_UNORM, + .component_size = {8}, + .component_map = {3}, + .pixel_stride = 1, + } + }; + + TEST(AV_PIX_FMT_YUVA420P, yuvap8); + + static const struct pl_plane_data yuvp16[] = { + { + .type = PL_FMT_UNORM, + .component_size = {16}, + .component_map = {0}, + .pixel_stride = 2, + }, { + .type = PL_FMT_UNORM, + .component_size = {16}, + .component_map = {1}, + .pixel_stride = 2, + }, { + .type = PL_FMT_UNORM, + .component_size = {16}, + .component_map = {2}, + .pixel_stride = 2, + } + }; + + TEST(AV_PIX_FMT_YUV420P10LE, yuvp16); + TEST(AV_PIX_FMT_YUV420P16LE, yuvp16); + + static const struct pl_plane_data nv12[] = { + { + .type = PL_FMT_UNORM, + .component_size = {8}, + .component_map = {0}, + .pixel_stride = 1, + }, { + .type = PL_FMT_UNORM, + .component_size = {8, 8}, + .component_map = {1, 2}, + .pixel_stride = 2, + } + }; + + TEST(AV_PIX_FMT_NV12, nv12); + + static const struct pl_plane_data nv21[] = { + { + .type = PL_FMT_UNORM, + .component_size = {8}, + .component_map = {0}, + .pixel_stride = 1, + }, { + .type = PL_FMT_UNORM, + .component_size = {8, 8}, + .component_map = {2, 1}, + .pixel_stride = 2, + } + }; + + TEST(AV_PIX_FMT_NV21, nv21); + + static const struct pl_plane_data p016[] = { + { + .type = PL_FMT_UNORM, + .component_size = {16}, + .component_map = {0}, + .pixel_stride = 2, + }, { + .type = PL_FMT_UNORM, + .component_size = {16, 16}, + .component_map = {1, 2}, + .pixel_stride = 4, + } + }; + + TEST(AV_PIX_FMT_P010LE, p016); + TEST(AV_PIX_FMT_P016LE, p016); + + // Packed formats + static const struct pl_plane_data r8[] = { + { + .type = PL_FMT_UNORM, + .component_size = {8}, + .component_map = {0}, + .pixel_stride = 1, + } + }; + + TEST(AV_PIX_FMT_GRAY8, r8); + + static const struct pl_plane_data rg8[] = { + { + .type = PL_FMT_UNORM, + .component_size = {8, 8}, + .component_map = {0, 1}, + .pixel_stride = 2, + } + }; + + TEST(AV_PIX_FMT_GRAY8A, rg8); + + static const struct pl_plane_data rgb8[] = { + { + .type = PL_FMT_UNORM, + .component_size = {8, 8, 8}, + .component_map = {0, 1, 2}, + .pixel_stride = 3, + } + }; + + TEST(AV_PIX_FMT_RGB24, rgb8); + + static const struct pl_plane_data bgr8[] = { + { + .type = PL_FMT_UNORM, + .component_size = {8, 8, 8}, + .component_map = {2, 1, 0}, + .pixel_stride = 3, + } + }; + + TEST(AV_PIX_FMT_BGR24, bgr8); + + static const struct pl_plane_data rgbx8[] = { + { + .type = PL_FMT_UNORM, + .component_size = {8, 8, 8}, + .component_map = {0, 1, 2}, + .pixel_stride = 4, + } + }; + + TEST(AV_PIX_FMT_RGB0, rgbx8); + + static const struct pl_plane_data xrgb8[] = { + { + .type = PL_FMT_UNORM, + .component_size = {8, 8, 8}, + .component_map = {0, 1, 2}, + .component_pad = {8, 0, 0}, + .pixel_stride = 4, + } + }; + + TEST(AV_PIX_FMT_0RGB, xrgb8); + + static const struct pl_plane_data rgba8[] = { + { + .type = PL_FMT_UNORM, + .component_size = {8, 8, 8, 8}, + .component_map = {0, 1, 2, 3}, + .pixel_stride = 4, + } + }; + + TEST(AV_PIX_FMT_RGBA, rgba8); + + static const struct pl_plane_data argb8[] = { + { + .type = PL_FMT_UNORM, + .component_size = {8, 8, 8, 8}, + .component_map = {3, 0, 1, 2}, + .pixel_stride = 4, + } + }; + + TEST(AV_PIX_FMT_ARGB, argb8); + + static const struct pl_plane_data bgra8[] = { + { + .type = PL_FMT_UNORM, + .component_size = {8, 8, 8, 8}, + .component_map = {2, 1, 0, 3}, + .pixel_stride = 4, + } + }; + + TEST(AV_PIX_FMT_BGRA, bgra8); + + static const struct pl_plane_data abgr8[] = { + { + .type = 
PL_FMT_UNORM, + .component_size = {8, 8, 8, 8}, + .component_map = {3, 2, 1, 0}, + .pixel_stride = 4, + } + }; + + TEST(AV_PIX_FMT_ABGR, abgr8); + + static const struct pl_plane_data r16[] = { + { + .type = PL_FMT_UNORM, + .component_size = {16}, + .component_map = {0}, + .pixel_stride = 2, + } + }; + + TEST(AV_PIX_FMT_GRAY16LE, r16); + + static const struct pl_plane_data rgb16[] = { + { + .type = PL_FMT_UNORM, + .component_size = {16, 16, 16}, + .component_map = {0, 1, 2}, + .pixel_stride = 6, + } + }; + + TEST(AV_PIX_FMT_RGB48LE, rgb16); + + static const struct pl_plane_data rgb16be[] = { + { + .type = PL_FMT_UNORM, + .component_size = {16, 16, 16}, + .component_map = {0, 1, 2}, + .pixel_stride = 6, + .swapped = true, + } + }; + + TEST(AV_PIX_FMT_RGB48BE, rgb16be); + + static const struct pl_plane_data rgba16[] = { + { + .type = PL_FMT_UNORM, + .component_size = {16, 16, 16, 16}, + .component_map = {0, 1, 2, 3}, + .pixel_stride = 8, + } + }; + + TEST(AV_PIX_FMT_RGBA64LE, rgba16); + + static const struct pl_plane_data rgba16be[] = { + { + .type = PL_FMT_UNORM, + .component_size = {16, 16, 16, 16}, + .component_map = {0, 1, 2, 3}, + .pixel_stride = 8, + .swapped = true, + } + }; + + TEST(AV_PIX_FMT_RGBA64BE, rgba16be); + + static const struct pl_plane_data rgb565[] = { + { + .type = PL_FMT_UNORM, + .component_size = {5, 6, 5}, + .component_map = {2, 1, 0}, // LSB to MSB + .pixel_stride = 2, + } + }; + + TEST(AV_PIX_FMT_RGB565LE, rgb565); + +#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 37, 100) + + static const struct pl_plane_data rgb32f[] = { + { + .type = PL_FMT_FLOAT, + .component_size = {32, 32, 32}, + .component_map = {0, 1, 2}, + .pixel_stride = 12, + } + }; + + TEST(AV_PIX_FMT_RGBF32LE, rgb32f); + +#endif + + // Test pl_frame <- AVFrame bridge + struct pl_frame image; + AVFrame *frame = av_frame_alloc(); + frame->format = AV_PIX_FMT_RGBA; + pl_frame_from_avframe(&image, frame); + REQUIRE_CMP(image.num_planes, ==, 1, "d"); + REQUIRE_CMP(image.repr.sys, ==, PL_COLOR_SYSTEM_RGB, "u"); + + // Test inverse mapping + struct pl_color_space csp = image.color; + pl_color_space_infer(&csp); + pl_avframe_set_color(frame, csp); + pl_avframe_set_repr(frame, image.repr); + pl_avframe_set_profile(frame, image.profile); + pl_frame_from_avframe(&image, frame); + pl_color_space_infer(&image.color); + REQUIRE(pl_color_space_equal(&csp, &image.color)); + av_frame_free(&frame); + + // Test enum functions + for (enum pl_color_system sys = 0; sys < PL_COLOR_SYSTEM_COUNT; sys++) { + enum AVColorSpace spc = pl_system_to_av(sys); + enum pl_color_system sys2 = pl_system_from_av(spc); + // Exception to the rule, due to different handling in libav* + if (sys2 && sys != PL_COLOR_SYSTEM_BT_2100_HLG) + REQUIRE_CMP(sys, ==, sys2, "u"); + } + + for (enum pl_color_levels lev = 0; lev < PL_COLOR_LEVELS_COUNT; lev++) { + enum AVColorRange range = pl_levels_to_av(lev); + enum pl_color_levels lev2 = pl_levels_from_av(range); + REQUIRE_CMP(lev, ==, lev2, "u"); + } + + for (enum pl_color_primaries prim = 0; prim < PL_COLOR_PRIM_COUNT; prim++) { + enum AVColorPrimaries avpri = pl_primaries_to_av(prim); + enum pl_color_primaries prim2 = pl_primaries_from_av(avpri); + if (prim2) + REQUIRE_CMP(prim, ==, prim2, "u"); + } + + for (enum pl_color_transfer trc = 0; trc < PL_COLOR_TRC_COUNT; trc++) { + enum AVColorTransferCharacteristic avtrc = pl_transfer_to_av(trc); + enum pl_color_transfer trc2 = pl_transfer_from_av(avtrc); + if (trc2) + REQUIRE_CMP(trc, ==, trc2, "u"); + } + + for (enum pl_chroma_location loc = 0; loc < 
PL_CHROMA_COUNT; loc++) { + enum AVChromaLocation avloc = pl_chroma_to_av(loc); + enum pl_chroma_location loc2 = pl_chroma_from_av(avloc); + REQUIRE_CMP(loc, ==, loc2, "u"); + } +} diff --git a/src/tests/lut.c b/src/tests/lut.c new file mode 100644 index 0000000..4af44ee --- /dev/null +++ b/src/tests/lut.c @@ -0,0 +1,86 @@ +#include "tests.h" + +#include <libplacebo/dummy.h> +#include <libplacebo/shaders/lut.h> + +static const char *luts[] = { + + "TITLE \"1D LUT example\" \n" + "LUT_1D_SIZE 11 \n" + "# Random comment \n" + "0.0 0.0 0.0 \n" + "0.1 0.1 0.1 \n" + "0.2 0.2 0.2 \n" + "0.3 0.3 0.3 \n" + "0.4 0.4 0.4 \n" + "0.5 0.5 0.5 \n" + "0.6 0.6 0.6 \n" + "0.7 0.7 0.7 \n" + "0.8 0.8 0.8 \n" + "0.9 0.9 0.9 \n" + "0.10 0.10 0.10 \n", + + "LUT_3D_SIZE 3 \n" + "TITLE \"3D LUT example\" \n" + "0.0 0.0 0.0 \n" + "0.5 0.0 0.0 \n" + "1.0 0.0 0.0 \n" + "0.0 0.5 0.0 \n" + "0.5 0.5 0.0 \n" + "1.0 0.5 0.0 \n" + "0.0 1.0 0.0 \n" + "0.5 1.0 0.0 \n" + "1.0 1.0 0.0 \n" + "0.0 0.0 0.5 \n" + "0.5 0.0 0.5 \n" + "1.0 0.0 0.5 \n" + "0.0 0.5 0.5 \n" + "0.5 0.5 0.5 \n" + "1.0 0.5 0.5 \n" + "0.0 1.0 0.5 \n" + "0.5 1.0 0.5 \n" + "1.0 1.0 0.5 \n" + "0.0 0.0 1.0 \n" + "0.5 0.0 1.0 \n" + "1.0 0.0 1.0 \n" + "0.0 0.5 1.0 \n" + "0.5 0.5 1.0 \n" + "1.0 0.5 1.0 \n" + "0.0 1.0 1.0 \n" + "0.5 1.0 1.0 \n" + "1.0 1.0 1.0 \n", + + "LUT_1D_SIZE 3 \n" + "TITLE \"custom domain\" \n" + "DOMAIN_MAX 255 255 255 \n" + "0 0 0 \n" + "128 128 128 \n" + "255 255 255 \n" + +}; + +int main() +{ + pl_log log = pl_test_logger(); + pl_gpu gpu = pl_gpu_dummy_create(log, NULL); + pl_shader sh = pl_shader_alloc(log, NULL); + pl_shader_obj obj = NULL; + + for (int i = 0; i < PL_ARRAY_SIZE(luts); i++) { + struct pl_custom_lut *lut; + lut = pl_lut_parse_cube(log, luts[i], strlen(luts[i])); + REQUIRE(lut); + + pl_shader_reset(sh, pl_shader_params( .gpu = gpu )); + pl_shader_custom_lut(sh, lut, &obj); + const struct pl_shader_res *res = pl_shader_finalize(sh); + REQUIRE(res); + printf("Generated LUT shader:\n%s\n", res->glsl); + pl_lut_free(&lut); + } + + pl_shader_obj_destroy(&obj); + pl_shader_free(&sh); + pl_gpu_dummy_destroy(&gpu); + pl_log_destroy(&log); +} diff --git a/src/tests/meson.build b/src/tests/meson.build new file mode 100644 index 0000000..335c6b1 --- /dev/null +++ b/src/tests/meson.build @@ -0,0 +1,39 @@ +ts = [] + +foreach t : tests + deps = [tdep_static] + if t == 'opengl_surfaceless.c' + deps += glad_dep + endif + # TODO: Define objects in tdep_static once Meson 1.1.0 is ok to use + ts += { 'source': t, + 'deps': deps, + 'objects': lib.extract_all_objects(recursive: false) } +endforeach + +dav1d = dependency('dav1d', required: false) +if dav1d.found() + ts += { 'source': 'dav1d.c', 'deps': [dav1d, tdep_shared] } +endif + +lavu = dependency('libavutil', version: '>=55.74.100', required: false) +lavc = dependency('libavcodec', required: false) +lavf = dependency('libavformat', required: false) +libav_found = lavu.found() and lavc.found() and lavf.found() +if libav_found + ts += { 'source': 'libav.c', 'deps': [lavu, lavc, lavf, tdep_shared] } +endif + +foreach t : ts + e = executable('test.' 
+ t['source'], t['source'], + objects: t.get('objects', []), + c_args: [ '-Wno-unused-function' ], + dependencies: t.get('deps', []), + link_args: link_args, + link_depends: link_depends, + ) + + test(t['source'], e, timeout: 120) +endforeach + +subdir('include') diff --git a/src/tests/opengl_surfaceless.c b/src/tests/opengl_surfaceless.c new file mode 100644 index 0000000..2d12a08 --- /dev/null +++ b/src/tests/opengl_surfaceless.c @@ -0,0 +1,247 @@ +#include "gpu_tests.h" +#include "opengl/utils.h" + +#include <libplacebo/opengl.h> + +static void opengl_interop_tests(pl_gpu gpu) +{ + pl_fmt fmt = pl_find_fmt(gpu, PL_FMT_UNORM, 1, 0, 0, + PL_FMT_CAP_RENDERABLE | PL_FMT_CAP_LINEAR); + if (!fmt) + return; + + pl_tex export = pl_tex_create(gpu, pl_tex_params( + .w = 32, + .h = 32, + .format = fmt, + .sampleable = true, + .renderable = true, + .blit_dst = fmt->caps & PL_FMT_CAP_BLITTABLE, + )); + + REQUIRE(export); + + struct pl_opengl_wrap_params wrap = { + .width = export->params.w, + .height = export->params.h, + .depth = export->params.d, + }; + + wrap.texture = pl_opengl_unwrap(gpu, export, &wrap.target, &wrap.iformat, NULL); + REQUIRE(wrap.texture); + + pl_tex import = pl_opengl_wrap(gpu, &wrap); + REQUIRE(import); + REQUIRE(import->params.renderable); + REQUIRE_CMP(import->params.blit_dst, ==, export->params.blit_dst, "d"); + + pl_tex_destroy(gpu, &import); + pl_tex_destroy(gpu, &export); +} + +#define PBUFFER_WIDTH 640 +#define PBUFFER_HEIGHT 480 + +struct swapchain_priv { + EGLDisplay display; + EGLSurface surface; +}; + +static void swap_buffers(void *priv) +{ + struct swapchain_priv *p = priv; + eglSwapBuffers(p->display, p->surface); +} + +static void opengl_swapchain_tests(pl_opengl gl, + EGLDisplay display, EGLSurface surface) +{ + if (surface == EGL_NO_SURFACE) + return; + + printf("testing opengl swapchain\n"); + pl_gpu gpu = gl->gpu; + pl_swapchain sw; + sw = pl_opengl_create_swapchain(gl, pl_opengl_swapchain_params( + .swap_buffers = swap_buffers, + .priv = &(struct swapchain_priv) { display, surface }, + )); + REQUIRE(sw); + + int w = PBUFFER_WIDTH, h = PBUFFER_HEIGHT; + REQUIRE(pl_swapchain_resize(sw, &w, &h)); + + for (int i = 0; i < 10; i++) { + struct pl_swapchain_frame frame; + REQUIRE(pl_swapchain_start_frame(sw, &frame)); + if (frame.fbo->params.blit_dst) + pl_tex_clear(gpu, frame.fbo, (float[4]){0}); + + // TODO: test this with an actual pl_renderer instance + struct pl_frame target; + pl_frame_from_swapchain(&target, &frame); + + REQUIRE(pl_swapchain_submit_frame(sw)); + pl_swapchain_swap_buffers(sw); + } + + pl_swapchain_destroy(&sw); +} + +int main() +{ + if (!gladLoaderLoadEGL(EGL_NO_DISPLAY)) + return SKIP; + + const char *extstr = eglQueryString(EGL_NO_DISPLAY, EGL_EXTENSIONS); + if (!extstr || !strstr(extstr, "EGL_MESA_platform_surfaceless")) + return SKIP; + + // Create the OpenGL context + EGLDisplay dpy = eglGetPlatformDisplayEXT(EGL_PLATFORM_SURFACELESS_MESA, + (void *) EGL_DEFAULT_DISPLAY, NULL); + if (dpy == EGL_NO_DISPLAY) + return SKIP; + + EGLint major, minor; + if (!eglInitialize(dpy, &major, &minor)) + return SKIP; + + if (!gladLoaderLoadEGL(dpy)) + return SKIP; + + printf("Initialized EGL v%d.%d\n", major, minor); + int egl_ver = major * 10 + minor; + + struct { + EGLenum api; + EGLenum render; + int major, minor; + int glsl_ver; + EGLenum profile; + } egl_vers[] = { + { EGL_OPENGL_API, EGL_OPENGL_BIT, 4, 6, 460, EGL_CONTEXT_OPENGL_CORE_PROFILE_BIT }, + { EGL_OPENGL_API, EGL_OPENGL_BIT, 3, 3, 330, EGL_CONTEXT_OPENGL_CORE_PROFILE_BIT }, + { 
EGL_OPENGL_API, EGL_OPENGL_BIT, 3, 0, 130, EGL_CONTEXT_OPENGL_COMPATIBILITY_PROFILE_BIT, }, + { EGL_OPENGL_ES_API, EGL_OPENGL_ES3_BIT, 3, 0, 300, }, + }; + + struct pl_glsl_version last_glsl = {0}; + struct pl_gpu_limits last_limits = {0}; + + pl_log log = pl_test_logger(); + + for (int i = 0; i < PL_ARRAY_SIZE(egl_vers); i++) { + + const int cfg_attribs[] = { + EGL_SURFACE_TYPE, EGL_PBUFFER_BIT, + EGL_RENDERABLE_TYPE, egl_vers[i].render, + EGL_NONE + }; + + EGLConfig config = 0; + EGLint num_configs = 0; + bool ok = eglChooseConfig(dpy, cfg_attribs, &config, 1, &num_configs); + if (!ok || !num_configs) + goto error; + + if (!eglBindAPI(egl_vers[i].api)) + goto error; + + EGLContext egl; + if (egl_vers[i].api == EGL_OPENGL_ES_API) { + // OpenGL ES + const EGLint egl_attribs[] = { + EGL_CONTEXT_CLIENT_VERSION, egl_vers[i].major, + (egl_ver >= 15) ? EGL_CONTEXT_OPENGL_DEBUG : EGL_NONE, EGL_TRUE, + EGL_NONE + }; + + printf("Attempting creation of OpenGL ES v%d context\n", egl_vers[i].major); + egl = eglCreateContext(dpy, config, EGL_NO_CONTEXT, egl_attribs); + } else { + // Desktop OpenGL + const int egl_attribs[] = { + EGL_CONTEXT_MAJOR_VERSION, egl_vers[i].major, + EGL_CONTEXT_MINOR_VERSION, egl_vers[i].minor, + EGL_CONTEXT_OPENGL_PROFILE_MASK, egl_vers[i].profile, + (egl_ver >= 15) ? EGL_CONTEXT_OPENGL_DEBUG : EGL_NONE, EGL_TRUE, + EGL_NONE + }; + + printf("Attempting creation of Desktop OpenGL v%d.%d context\n", + egl_vers[i].major, egl_vers[i].minor); + egl = eglCreateContext(dpy, config, EGL_NO_CONTEXT, egl_attribs); + } + + if (!egl) + goto error; + + const EGLint pbuffer_attribs[] = { + EGL_WIDTH, PBUFFER_WIDTH, + EGL_HEIGHT, PBUFFER_HEIGHT, + EGL_NONE + }; + + EGLSurface surf = eglCreatePbufferSurface(dpy, config, pbuffer_attribs); + + if (!eglMakeCurrent(dpy, surf, surf, egl)) + goto error; + + pl_opengl gl = pl_opengl_create(log, pl_opengl_params( + .get_proc_addr = (pl_voidfunc_t (*)(const char *)) eglGetProcAddress, + .max_glsl_version = egl_vers[i].glsl_ver, + .debug = true, + .egl_display = dpy, + .egl_context = egl, +#ifdef CI_ALLOW_SW + .allow_software = true, +#endif + )); + if (!gl) + goto next; + + // Skip repeat tests + pl_gpu gpu = gl->gpu; + if (memcmp(&last_glsl, &gpu->glsl, sizeof(last_glsl)) == 0 && + memcmp(&last_limits, &gpu->limits, sizeof(last_limits)) == 0) + { + printf("Skipping tests due to duplicate capabilities/version\n"); + goto next; + } + +#ifdef CI_MAXGL + if (last_glsl.version && last_glsl.gles == gpu->glsl.gles) + goto next; +#endif + + last_glsl = gpu->glsl; + last_limits = gpu->limits; + + gpu_shader_tests(gpu); + gpu_interop_tests(gpu); + opengl_interop_tests(gpu); + opengl_swapchain_tests(gl, dpy, surf); + + // Reduce log spam after first successful test + pl_log_level_update(log, PL_LOG_INFO); + +next: + pl_opengl_destroy(&gl); + eglDestroySurface(dpy, surf); + eglDestroyContext(dpy, egl); + continue; + +error: ; + EGLint error = eglGetError(); + if (error != EGL_SUCCESS) + fprintf(stderr, "EGL error: %s\n", egl_err_str(error)); + } + + eglTerminate(dpy); + gladLoaderUnloadEGL(); + pl_log_destroy(&log); + + if (!last_glsl.version) + return SKIP; +} diff --git a/src/tests/options.c b/src/tests/options.c new file mode 100644 index 0000000..f178668 --- /dev/null +++ b/src/tests/options.c @@ -0,0 +1,123 @@ +#include "tests.h" + +#include <libplacebo/options.h> + +static void count_cb(void *priv, pl_opt_data data) +{ + int *num = priv; + printf("Iterating over option: %s = %s\n", data->opt->key, data->text); + (*num)++; +} + +static void set_cb(void 
*priv, pl_opt_data data)
+{
+    pl_options dst = priv;
+    REQUIRE(pl_options_set_str(dst, data->opt->key, data->text));
+}
+
+int main()
+{
+    pl_log log = pl_test_logger();
+    pl_options test = pl_options_alloc(log);
+
+    REQUIRE_STREQ(pl_options_save(test), "");
+    REQUIRE(pl_options_load(test, ""));
+    REQUIRE_STREQ(pl_options_save(test), "");
+
+    pl_options_reset(test, &pl_render_fast_params);
+    REQUIRE_STREQ(pl_options_save(test), "");
+    REQUIRE(pl_options_load(test, "preset=fast"));
+    REQUIRE_STREQ(pl_options_save(test), "");
+
+    const char *def_opts = "upscaler=lanczos,downscaler=hermite,frame_mixer=oversample,sigmoid=yes,peak_detect=yes,dither=yes";
+    pl_options_reset(test, &pl_render_default_params);
+    REQUIRE_STREQ(pl_options_save(test), def_opts);
+    struct pl_options_t def_pre = *test;
+    pl_options_reset(test, NULL);
+    REQUIRE_STREQ(pl_options_save(test), "");
+    REQUIRE(pl_options_load(test, def_opts));
+    REQUIRE_STREQ(pl_options_save(test), def_opts);
+    REQUIRE_MEMEQ(test, &def_pre, sizeof(*test));
+    pl_options_reset(test, NULL);
+    REQUIRE(pl_options_load(test, "preset=default"));
+    REQUIRE_STREQ(pl_options_save(test), def_opts);
+    REQUIRE_MEMEQ(test, &def_pre, sizeof(*test));
+
+    int num = 0;
+    pl_options_iterate(test, count_cb, &num);
+    REQUIRE_CMP(num, ==, 6, "d");
+
+    pl_opt_data data;
+    REQUIRE((data = pl_options_get(test, "tile_size")));
+    REQUIRE_STREQ(data->opt->key, "tile_size");
+    REQUIRE_CMP(*(int *) data->value, ==, pl_render_default_params.tile_size, "d");
+    REQUIRE_STREQ(data->text, "32");
+
+    const char *hq_opts = "upscaler=ewa_lanczossharp,downscaler=hermite,frame_mixer=oversample,deband=yes,sigmoid=yes,peak_detect=yes,peak_percentile=99.99500274658203,contrast_recovery=0.30000001192092896,dither=yes";
+    // fallback can produce different precision
+    const char *hq_opts2 = "upscaler=ewa_lanczossharp,downscaler=hermite,frame_mixer=oversample,deband=yes,sigmoid=yes,peak_detect=yes,peak_percentile=99.99500274658203125,contrast_recovery=0.30000001192092896,dither=yes";
+
+    pl_options_reset(test, &pl_render_high_quality_params);
+    const char *opts = pl_options_save(test);
+    if (!strcmp(opts, hq_opts2))
+        hq_opts = hq_opts2;
+    REQUIRE_STREQ(opts, hq_opts);
+    struct pl_options_t hq_pre = *test;
+    pl_options_reset(test, NULL);
+    REQUIRE_STREQ(pl_options_save(test), "");
+    REQUIRE(pl_options_load(test, hq_opts));
+    REQUIRE_STREQ(pl_options_save(test), hq_opts);
+    REQUIRE_MEMEQ(test, &hq_pre, sizeof(*test));
+    REQUIRE(pl_options_load(test, "preset=high_quality"));
+    REQUIRE_STREQ(pl_options_save(test), hq_opts);
+    REQUIRE_MEMEQ(test, &hq_pre, sizeof(*test));
+
+    pl_options test2 = pl_options_alloc(log);
+    pl_options_iterate(test, set_cb, test2);
+    REQUIRE_STREQ(pl_options_save(test), pl_options_save(test2));
+    pl_options_free(&test2);
+
+    // Test custom scalers
+    pl_options_reset(test, pl_render_params(
+        .upscaler = &(struct pl_filter_config) {
+            .kernel = &pl_filter_function_jinc,
+            .window = &pl_filter_function_jinc,
+            .radius = 4.0,
+            .polar = true,
+        },
+    ));
+    const char *jinc4_opts = "upscaler=custom,upscaler_kernel=jinc,upscaler_window=jinc,upscaler_radius=4,upscaler_polar=yes";
+    REQUIRE_STREQ(pl_options_save(test), jinc4_opts);
+    struct pl_options_t jinc4_pre = *test;
+    pl_options_reset(test, NULL);
+    REQUIRE(pl_options_load(test, "upscaler=custom,upscaler_preset=ewa_lanczos,upscaler_radius=4.0,upscaler_clamp=0.0"));
+    REQUIRE_STREQ(pl_options_save(test), jinc4_opts);
+    REQUIRE_MEMEQ(test, &jinc4_pre, sizeof(*test));
+
+    // Test params presets
+    pl_options_reset(test, 
NULL); + REQUIRE(pl_options_load(test, "cone=yes,cone_preset=deuteranomaly")); + REQUIRE_STREQ(pl_options_save(test), "cone=yes,cones=m,cone_strength=0.5"); + + // Test error paths + pl_options bad = pl_options_alloc(NULL); + REQUIRE(!pl_options_load(bad, "scale_preset=help")); + REQUIRE(!pl_options_load(bad, "dither_method=invalid")); + REQUIRE(!pl_options_load(bad, "lut_entries=-1")); + REQUIRE(!pl_options_load(bad, "deband_iterations=100")); + REQUIRE(!pl_options_load(bad, "tone_lut_size=abc")); + REQUIRE(!pl_options_load(bad, "show_clipping=hello")); + REQUIRE(!pl_options_load(bad, "brightness=2.0")); + REQUIRE(!pl_options_load(bad, "gamma=oops")); + REQUIRE(!pl_options_load(bad, "invalid")); + REQUIRE(!pl_options_load(bad, "=")); + REQUIRE(!pl_options_load(bad, "preset==bar")); + REQUIRE(!pl_options_load(bad, "peak_percentile=E8203125")); + REQUIRE(!pl_options_get(bad, "invalid")); + REQUIRE_STREQ(pl_options_save(bad), ""); + pl_options_free(&bad); + + pl_options_free(&test); + pl_log_destroy(&log); + return 0; +} diff --git a/src/tests/string.c b/src/tests/string.c new file mode 100644 index 0000000..52985c4 --- /dev/null +++ b/src/tests/string.c @@ -0,0 +1,147 @@ +#include "tests.h" + +static const pl_str null = {0}; +static const pl_str test = PL_STR0("test"); +static const pl_str empty = PL_STR0(""); + +static inline bool is_null(pl_str str) +{ + return !str.len && !str.buf; +} + +static inline bool is_empty(pl_str str) +{ + return !str.len; +} + +int main() +{ + void *tmp = pl_tmp(NULL); + + REQUIRE(is_null(pl_str0(NULL))); + REQUIRE(is_null(pl_strdup(tmp, null))); + char *empty0 = pl_strdup0(tmp, null); + REQUIRE(empty0 && !empty0[0]); + REQUIRE(pl_str_equals0(empty, empty0)); + + pl_str buf = {0}; + pl_str_append(tmp, &buf, null); + REQUIRE(is_empty(buf)); + pl_str_append_asprintf(tmp, &buf, "%.*s", PL_STR_FMT(test)); + REQUIRE(pl_str_equals(buf, test)); + + pl_str_append_asprintf_c(tmp, &buf, "%d %f %f %f %lld %zu %.*sx %hx %hx %hx %hx", + 1, 1.0f, 4294967295.56, 83224965647295.65, 0xFFll, (size_t) 0, PL_STR_FMT(empty), + (unsigned short) 0xCAFEu, (unsigned short) 0x1, (unsigned short) 0, + (unsigned short) 0xFFFFu); + const char *expected = "test1 1 4294967295.56 83224965647295.66 255 0 x cafe 1 0 ffff"; + // fallback can produce different precision + const char *expected2 = "test1 1 4294967295.55999994277954102 83224965647295.65625 255 0 x cafe 1 0 ffff"; + REQUIRE(pl_str_equals0(buf, expected) || pl_str_equals0(buf, expected2)); + + REQUIRE_CMP(pl_strchr(null, ' '), <, 0, "d"); + REQUIRE_CMP((int) pl_strspn(null, " "), ==, 0, "d"); + REQUIRE_CMP((int) pl_strcspn(null, " "), ==, 0, "d"); + REQUIRE(is_null(pl_str_strip(null))); + + REQUIRE_CMP(pl_strchr(test, 's'), ==, 2, "d"); + REQUIRE_CMP((int) pl_strspn(test, "et"), ==, 2, "d"); + REQUIRE_CMP((int) pl_strcspn(test, "xs"), ==, 2, "d"); + + REQUIRE(is_null(pl_str_take(null, 10))); + REQUIRE(is_empty(pl_str_take(test, 0))); + REQUIRE(is_null(pl_str_drop(null, 10))); + REQUIRE(is_null(pl_str_drop(test, test.len))); + REQUIRE(pl_str_equals(pl_str_drop(test, 0), test)); + + REQUIRE_CMP(pl_str_find(null, test), <, 0, "d"); + REQUIRE_CMP(pl_str_find(null, null), ==, 0, "d"); + REQUIRE_CMP(pl_str_find(test, null), ==, 0, "d"); + REQUIRE_CMP(pl_str_find(test, test), ==, 0, "d"); + + pl_str rest; + REQUIRE(is_null(pl_str_split_char(null, ' ', &rest)) && is_null(rest)); + REQUIRE(is_null(pl_str_split_str(null, test, &rest)) && is_null(rest)); + REQUIRE(is_empty(pl_str_split_str(test, test, &rest)) && is_empty(rest)); + 
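+    // pl_str_getline on a null string should likewise leave both the line
+    // and the remainder null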
REQUIRE(is_null(pl_str_getline(null, &rest)) && is_null(rest)); + + pl_str right, left = pl_str_split_char(pl_str0("left right"), ' ', &right); + REQUIRE(pl_str_equals0(left, "left")); + REQUIRE(pl_str_equals0(right, "right")); + + left = pl_str_split_str0(pl_str0("leftTESTright"), "TEST", &right); + REQUIRE(pl_str_equals0(left, "left")); + REQUIRE(pl_str_equals0(right, "right")); + + pl_str out; + REQUIRE(pl_str_decode_hex(tmp, null, &out) && is_empty(out)); + REQUIRE(!pl_str_decode_hex(tmp, pl_str0("invalid"), &out)); + + REQUIRE(pl_str_equals(null, null)); + REQUIRE(pl_str_equals(null, empty)); + REQUIRE(pl_str_startswith(null, null)); + REQUIRE(pl_str_startswith(test, null)); + REQUIRE(pl_str_startswith(test, test)); + REQUIRE(pl_str_endswith(null, null)); + REQUIRE(pl_str_endswith(test, null)); + REQUIRE(pl_str_endswith(test, test)); + + double d; + float f; + int i; + unsigned u; + int64_t i64; + uint64_t u64; + + REQUIRE(pl_str_parse_double(pl_str0("4294967295.56"), &d)); REQUIRE_FEQ(d, 4294967295.56, 1e-20); + REQUIRE(pl_str_parse_double(pl_str0("-4294967295.56"), &d)); REQUIRE_FEQ(d, -4294967295.56, 1e-20); + REQUIRE(pl_str_parse_double(pl_str0("83224965647295.65"), &d)); REQUIRE_FEQ(d, 83224965647295.65, 1e-20); + REQUIRE(pl_str_parse_double(pl_str0("-83224965647295.65"), &d)); REQUIRE_FEQ(d, -83224965647295.65, 1e-20); + REQUIRE(pl_str_parse_float(pl_str0("4294967295.56"), &f)); REQUIRE_FEQ(f, 4294967295.56f, 1e-8); + REQUIRE(pl_str_parse_float(pl_str0("-4294967295.56"), &f)); REQUIRE_FEQ(f, -4294967295.56f, 1e-8); + REQUIRE(pl_str_parse_float(pl_str0("83224965647295.65"), &f)); REQUIRE_FEQ(f, 83224965647295.65f, 1e-8); + REQUIRE(pl_str_parse_float(pl_str0("-83224965647295.65"), &f)); REQUIRE_FEQ(f, -83224965647295.65f, 1e-8); + REQUIRE(pl_str_parse_float(pl_str0("1.3984"), &f)); REQUIRE_FEQ(f, 1.3984f, 1e-8); + REQUIRE(pl_str_parse_float(pl_str0("-8.9100083"), &f)); REQUIRE_FEQ(f, -8.9100083f, 1e-8); + REQUIRE(pl_str_parse_float(pl_str0("-0"), &f)); REQUIRE_FEQ(f, 0.0f, 1e-8); + REQUIRE(pl_str_parse_float(pl_str0("-3.14e20"), &f)); REQUIRE_FEQ(f, -3.14e20f, 1e-8); + REQUIRE(pl_str_parse_float(pl_str0("0.5e-5"), &f)); REQUIRE_FEQ(f, 0.5e-5f, 1e-8); + REQUIRE(pl_str_parse_float(pl_str0("0.5e+5"), &f)); REQUIRE_FEQ(f, 0.5e+5f, 1e-8); + REQUIRE(pl_str_parse_int(pl_str0("64239"), &i)); REQUIRE_CMP(i, ==, 64239, "d"); + REQUIRE(pl_str_parse_int(pl_str0("-102"), &i)); REQUIRE_CMP(i, ==, -102, "d"); + REQUIRE(pl_str_parse_int(pl_str0("1"), &i)); REQUIRE_CMP(i, ==, 1, "d"); + REQUIRE(pl_str_parse_int(pl_str0("-0"), &i)); REQUIRE_CMP(i, ==, 0, "d"); + REQUIRE(pl_str_parse_uint(pl_str0("64239"), &u)); REQUIRE_CMP(u, ==, 64239, "u"); + REQUIRE(pl_str_parse_uint(pl_str0("1"), &u)); REQUIRE_CMP(u, ==, 1, "u"); + REQUIRE(pl_str_parse_int64(pl_str0("9223372036854775799"), &i64)); + REQUIRE_CMP(i64, ==, 9223372036854775799LL, PRIi64); + REQUIRE(pl_str_parse_int64(pl_str0("-9223372036854775799"), &i64)); + REQUIRE_CMP(i64, ==, -9223372036854775799LL, PRIi64); + REQUIRE(pl_str_parse_uint64(pl_str0("18446744073709551609"), &u64)); + REQUIRE_CMP(u64, ==, 18446744073709551609LLU, PRIu64); + REQUIRE(!pl_str_parse_float(null, &f)); + REQUIRE(!pl_str_parse_float(test, &f)); + REQUIRE(!pl_str_parse_float(empty, &f)); + REQUIRE(!pl_str_parse_int(null, &i)); + REQUIRE(!pl_str_parse_int(test, &i)); + REQUIRE(!pl_str_parse_int(empty, &i)); + REQUIRE(!pl_str_parse_uint(null, &u)); + REQUIRE(!pl_str_parse_uint(test, &u)); + REQUIRE(!pl_str_parse_uint(empty, &u)); + + pl_str_builder builder = 
pl_str_builder_alloc(tmp); + pl_str_builder_const_str(builder, "hello"); + pl_str_builder_str(builder, pl_str0("world")); + pl_str res = pl_str_builder_exec(builder); + REQUIRE(pl_str_equals0(res, "helloworld")); + + pl_str_builder_reset(builder); + pl_str_builder_printf_c(builder, "foo %d bar %u bat %s baz %lld", + 123, 56u, "quack", 0xDEADBEEFll); + pl_str_builder_printf_c(builder, " %.*s", PL_STR_FMT(pl_str0("test123"))); + res = pl_str_builder_exec(builder); + REQUIRE(pl_str_equals0(res, "foo 123 bar 56 bat quack baz 3735928559 test123")); + + pl_free(tmp); + return 0; +} diff --git a/src/tests/tests.h b/src/tests/tests.h new file mode 100644 index 0000000..a33a0de --- /dev/null +++ b/src/tests/tests.h @@ -0,0 +1,319 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include "common.h" + +#include <libplacebo/log.h> +#include <libplacebo/colorspace.h> +#include <libplacebo/shaders/film_grain.h> + +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <time.h> + +#ifdef PL_HAVE_WIN32 +#include <io.h> +#define isatty _isatty +#define fileno _fileno +#else +#include <unistd.h> +#endif + +static void pl_log_timestamp(void *stream, enum pl_log_level level, const char *msg) +{ + static char letter[] = { + [PL_LOG_FATAL] = 'f', + [PL_LOG_ERR] = 'e', + [PL_LOG_WARN] = 'w', + [PL_LOG_INFO] = 'i', + [PL_LOG_DEBUG] = 'd', + [PL_LOG_TRACE] = 't', + }; + + // Log time relative to the first message + static pl_clock_t base = 0; + if (!base) + base = pl_clock_now(); + + double secs = pl_clock_diff(pl_clock_now(), base); + printf("[%2.3f][%c] %s\n", secs, letter[level], msg); + + if (level <= PL_LOG_WARN) { + // duplicate warnings/errors to stderr + fprintf(stderr, "[%2.3f][%c] %s\n", secs, letter[level], msg); + fflush(stderr); + } +} + +static inline pl_log pl_test_logger(void) +{ + setbuf(stdout, NULL); + setbuf(stderr, NULL); + + return pl_log_create(PL_API_VER, pl_log_params( + .log_cb = isatty(fileno(stdout)) ? 
pl_log_color : pl_log_timestamp, + .log_level = PL_LOG_DEBUG, + )); +} + +#define RANDOM (rand() / (float) RAND_MAX) +#define RANDOM_U8 ((uint8_t) (256.0 * rand() / (RAND_MAX + 1.0))) +#define SKIP 77 + +// Helpers for performing various checks +#define REQUIRE(cond) do \ +{ \ + if (!(cond)) { \ + fprintf(stderr, "=== FAILED: '"#cond"' at "__FILE__":%d\n\n", __LINE__);\ + exit(1); \ + } \ +} while (0) + +#define REQUIRE_CMP(a, op, b, fmt) do \ +{ \ + __typeof__(a) _va = (a), _vb = (b); \ + \ + if (!(_va op _vb)) { \ + fprintf(stderr, "=== FAILED: '"#a" "#op" "#b"' at "__FILE__":%d\n" \ + " %-31s = %"fmt"\n" \ + " %-31s = %"fmt"\n\n", \ + __LINE__, #a, _va, #b, _vb); \ + exit(1); \ + } \ +} while (0) + +#define REQUIRE_FEQ(a, b, epsilon) do \ +{ \ + float _va = (a); \ + float _vb = (b); \ + float _delta = (epsilon) * fmax(1.0, fabs(_va)); \ + \ + if (fabs(_va - _vb) > _delta) { \ + fprintf(stderr, "=== FAILED: '"#a" ≈ "#b"' at "__FILE__":%d\n" \ + " %-31s = %f\n" \ + " %-31s = %f\n" \ + " %-31s = %f\n\n", \ + __LINE__, #a, _va, #b, _vb, \ + "epsilon "#epsilon" -> max delta", _delta); \ + exit(1); \ + } \ +} while (0) + +#define REQUIRE_STREQ(a, b) do \ +{ \ + const char *_a = (a); \ + const char *_b = (b); \ + if (strcmp(_a, _b) != 0) { \ + fprintf(stderr, "=== FAILED: !strcmp("#a", "#b") at "__FILE__":%d\n" \ + " %-31s = %s\n" \ + " %-31s = %s\n\n", \ + __LINE__, #a, _a, #b, _b); \ + exit(1); \ + } \ +} while (0) + +static inline void log_array(const uint8_t *a, const uint8_t *ref, size_t off, size_t size) +{ + for (size_t n = 0; n < size; n++) { + const char *prefix = "", *suffix = ""; + char terminator = ' '; + if (a[n + off] != ref[n + off]) { + prefix = "\033[31;1m"; + suffix = "\033[0m"; + } + if (n+1 == size || n % 16 == 15) + terminator = '\n'; + fprintf(stderr, "%s%02"PRIx8"%s%c", prefix, a[n + off], suffix, terminator); + } +} + +static inline void require_memeq(const void *aptr, const void *bptr, size_t size, + const char *astr, const char *bstr, + const char *sizestr, const char *file, int line) +{ + const uint8_t *a = aptr, *b = bptr; + for (size_t i = 0; i < size; i++) { + if (a[i] == b[i]) + continue; + + fprintf(stderr, "=== FAILED: memcmp(%s, %s, %s) == 0 at %s:%d\n" + "at position %zu: 0x%02"PRIx8" != 0x%02"PRIx8"\n\n", + astr, bstr, sizestr, file, line, i, a[i], b[i]); + + size_t start = i >= 256 ? 
i - 256 : 0; + size_t end = PL_MIN(size, i + 256); + fprintf(stderr, "%zu bytes of '%s' at offset %zu:\n", end - start, astr, start); + log_array(a, b, start, end - start); + fprintf(stderr, "\n%zu bytes of '%s' at offset %zu:\n", end - start, bstr, start); + log_array(b, a, start, end - start); + exit(1); + } +} + +#define REQUIRE_MEMEQ(a, b, size) require_memeq(a, b, size, #a, #b, #size, __FILE__, __LINE__) + +#define REQUIRE_HANDLE(shmem, type) \ + switch (type) { \ + case PL_HANDLE_FD: \ + case PL_HANDLE_DMA_BUF: \ + REQUIRE(shmem.handle.fd > -1); \ + break; \ + case PL_HANDLE_WIN32: \ + case PL_HANDLE_WIN32_KMT: \ + /* INVALID_HANDLE_VALUE = (-1) */ \ + REQUIRE(shmem.handle.handle != (void *)(intptr_t) (-1)); \ + /* fallthrough */ \ + case PL_HANDLE_MTL_TEX: \ + case PL_HANDLE_IOSURFACE: \ + REQUIRE(shmem.handle.handle); \ + break; \ + case PL_HANDLE_HOST_PTR: \ + REQUIRE(shmem.handle.ptr); \ + break; \ + } + +static const struct pl_av1_grain_data av1_grain_data = { + .num_points_y = 6, + .points_y = {{0, 4}, {27, 33}, {54, 55}, {67, 61}, {108, 71}, {255, 72}}, + .chroma_scaling_from_luma = false, + .num_points_uv = {2, 2}, + .points_uv = {{{0, 64}, {255, 64}}, {{0, 64}, {255, 64}}}, + .scaling_shift = 11, + .ar_coeff_lag = 3, + .ar_coeffs_y = {4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66}, + .ar_coeffs_uv = { + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127}, + }, + .ar_coeff_shift = 7, + .grain_scale_shift = 0, + .uv_mult = {0, 0}, + .uv_mult_luma = {64, 64}, + .uv_offset = {0, 0}, +}; + +static const uint8_t h274_lower_bound = 10; +static const uint8_t h274_upper_bound = 250; +static const int16_t h274_values[6] = {16, 12, 14}; + +static const struct pl_h274_grain_data h274_grain_data = { + .model_id = 0, + .blending_mode_id = 0, + .log2_scale_factor = 2, + .component_model_present = {true}, + .num_intensity_intervals = {1}, + .num_model_values = {3}, + .intensity_interval_lower_bound = {&h274_lower_bound}, + .intensity_interval_upper_bound = {&h274_upper_bound}, + .comp_model_value = {&h274_values}, +}; + +static const struct pl_dovi_metadata dovi_meta = { + .nonlinear = {{{1, 0, 0}, {0, 1, 0}, {0, 0, 1}}}, + .linear = {{{1, 0, 0}, {0, 1, 0}, {0, 0, 1}}}, + .comp = { + { + .num_pivots = 9, + .pivots = {0.0615835786, 0.129032254, 0.353861183, + 0.604105592, 0.854349971, 0.890518069, + 0.906158328, 0.913978517, 0.92082113}, + .method = {0, 0, 0, 0, 0, 0, 0, 0}, + .poly_coeffs = { + {-0.0488376617, 1.99335372, -2.41716385}, + {-0.0141925812, 1.61829138, -1.53397191}, + { 0.157061458, 0.63640213, -0.11302495}, + {0.25272119, 0.246226311, 0.27281332}, + {0.951621532, -1.35507894, 1.18898678}, + {6.41251612, -13.6188488, 8.07336903}, + {13.467535, -29.1869125, 16.6612244}, + {28.2321472, -61.8516273, 34.7264938} + }, + }, { + .num_pivots = 2, + .pivots = {0.0, 1.0}, + .method = {1}, + .mmr_order = {3}, + .mmr_constant = {-0.500733018}, + .mmr_coeffs = {{ + {1.08411026, 3.80807829, 0.0881733894, -3.23097038, -0.409078479, -1.31310081, 2.71297002}, + {-0.241833091, -3.57880807, -0.108109117, 3.13198471, 0.869203091, 1.96561158, -9.30871677}, + {-0.177356839, 1.48970401, 0.0908923149, -0.510447979, -0.687603354, -0.934977889, 12.3544884}, + }}, + }, { + .num_pivots = 2, + .pivots = {0.0, 1.0}, + .method = {1}, + .mmr_order = {3}, + .mmr_constant = {-1.23833287}, + .mmr_coeffs = {{ + {3.52909589, 0.383154511, 5.50820637, -1.02094889, 
-6.36386824, 0.194121242, 0.64683497}, + {-2.57899785, -0.626081586, -6.05729723, 2.29143763, 9.14653015, -0.0507702827, -4.17724133}, + {0.705404401, 0.341412306, 2.98387456, -1.71712542, -4.91501331, 0.1465137, 6.38665438}, + }}, + }, + }, +}; + +static const uint8_t sRGB_v2_nano_icc[] = { + 0x00, 0x00, 0x01, 0x9a, 0x6c, 0x63, 0x6d, 0x73, 0x02, 0x10, 0x00, 0x00, + 0x6d, 0x6e, 0x74, 0x72, 0x52, 0x47, 0x42, 0x20, 0x58, 0x59, 0x5a, 0x20, + 0x07, 0xe2, 0x00, 0x03, 0x00, 0x14, 0x00, 0x09, 0x00, 0x0e, 0x00, 0x1d, + 0x61, 0x63, 0x73, 0x70, 0x4d, 0x53, 0x46, 0x54, 0x00, 0x00, 0x00, 0x00, + 0x73, 0x61, 0x77, 0x73, 0x63, 0x74, 0x72, 0x6c, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf6, 0xd6, + 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0xd3, 0x2d, 0x68, 0x61, 0x6e, 0x64, + 0xeb, 0x77, 0x1f, 0x3c, 0xaa, 0x53, 0x51, 0x02, 0xe9, 0x3e, 0x28, 0x6c, + 0x91, 0x46, 0xae, 0x57, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x09, + 0x64, 0x65, 0x73, 0x63, 0x00, 0x00, 0x00, 0xf0, 0x00, 0x00, 0x00, 0x5f, + 0x77, 0x74, 0x70, 0x74, 0x00, 0x00, 0x01, 0x0c, 0x00, 0x00, 0x00, 0x14, + 0x72, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x01, 0x20, 0x00, 0x00, 0x00, 0x14, + 0x67, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x01, 0x34, 0x00, 0x00, 0x00, 0x14, + 0x62, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x01, 0x48, 0x00, 0x00, 0x00, 0x14, + 0x72, 0x54, 0x52, 0x43, 0x00, 0x00, 0x01, 0x5c, 0x00, 0x00, 0x00, 0x34, + 0x67, 0x54, 0x52, 0x43, 0x00, 0x00, 0x01, 0x5c, 0x00, 0x00, 0x00, 0x34, + 0x62, 0x54, 0x52, 0x43, 0x00, 0x00, 0x01, 0x5c, 0x00, 0x00, 0x00, 0x34, + 0x63, 0x70, 0x72, 0x74, 0x00, 0x00, 0x01, 0x90, 0x00, 0x00, 0x00, 0x0a, + 0x64, 0x65, 0x73, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, + 0x6e, 0x52, 0x47, 0x42, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x58, 0x59, 0x5a, 0x20, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0xf3, 0x54, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x16, 0xc9, + 0x58, 0x59, 0x5a, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x6f, 0xa0, + 0x00, 0x00, 0x38, 0xf2, 0x00, 0x00, 0x03, 0x8f, 0x58, 0x59, 0x5a, 0x20, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x62, 0x96, 0x00, 0x00, 0xb7, 0x89, + 0x00, 0x00, 0x18, 0xda, 0x58, 0x59, 0x5a, 0x20, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x24, 0xa0, 0x00, 0x00, 0x0f, 0x85, 0x00, 0x00, 0xb6, 0xc4, + 0x63, 0x75, 0x72, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x14, + 0x00, 0x00, 0x01, 0x07, 0x02, 0xb5, 0x05, 0x6b, 0x09, 0x36, 0x0e, 0x50, + 0x14, 0xb1, 0x1c, 0x80, 0x25, 0xc8, 0x30, 0xa1, 0x3d, 0x19, 0x4b, 0x40, + 0x5b, 0x27, 0x6c, 0xdb, 0x80, 0x6b, 0x95, 0xe3, 0xad, 0x50, 0xc6, 0xc2, + 0xe2, 0x31, 0xff, 0xff, 0x74, 0x65, 0x78, 0x74, 0x00, 0x00, 0x00, 0x00, + 0x30, 0x00 +}; + +#define TEST_PROFILE(arr) ((struct pl_icc_profile) { \ + .data = (arr), \ + .len = PL_ARRAY_SIZE(arr), \ + .signature = (uintptr_t) (arr), \ +}) diff --git a/src/tests/tone_mapping.c b/src/tests/tone_mapping.c new file mode 100644 index 0000000..0a48945 --- /dev/null +++ b/src/tests/tone_mapping.c @@ -0,0 +1,181 @@ +#include "tests.h" +#include "log.h" + +#include <libplacebo/gamut_mapping.h> +#include <libplacebo/tone_mapping.h> + +//#define PRINT_LUTS + +int main() +{ + pl_log log = pl_test_logger(); + + // PQ unit tests + REQUIRE_FEQ(pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NITS, 0.0), 0.0, 1e-2); + REQUIRE_FEQ(pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NITS, 1.0), 10000.0, 1e-2); + REQUIRE_FEQ(pl_hdr_rescale(PL_HDR_PQ, 
PL_HDR_NITS, 0.58), 203.0, 1e-2); + + // Test round-trip + for (float x = 0.0f; x < 1.0f; x += 0.01f) { + REQUIRE_FEQ(x, pl_hdr_rescale(PL_HDR_NORM, PL_HDR_PQ, + pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, x)), + 1e-5); + } + + static float lut[128]; + struct pl_tone_map_params params = { + .constants = { PL_TONE_MAP_CONSTANTS }, + .input_scaling = PL_HDR_PQ, + .output_scaling = PL_HDR_PQ, + .lut_size = PL_ARRAY_SIZE(lut), + }; + + // Test regular tone-mapping + params.input_min = pl_hdr_rescale(PL_HDR_NITS, params.input_scaling, 0.005); + params.input_max = pl_hdr_rescale(PL_HDR_NITS, params.input_scaling, 1000.0); + params.output_min = pl_hdr_rescale(PL_HDR_NORM, params.output_scaling, 0.001); + params.output_max = pl_hdr_rescale(PL_HDR_NORM, params.output_scaling, 1.0); + + struct pl_tone_map_params params_inv = params; + PL_SWAP(params_inv.input_min, params_inv.output_min); + PL_SWAP(params_inv.input_max, params_inv.output_max); + + int tested_pure_bpc = 0; + + // Generate example tone mapping curves, forward and inverse + for (int i = 0; i < pl_num_tone_map_functions; i++) { + const struct pl_tone_map_function *fun = pl_tone_map_functions[i]; + printf("Testing tone-mapping function %s\n", fun->name); + params.function = params_inv.function = fun; + pl_clock_t start = pl_clock_now(); + pl_tone_map_generate(lut, ¶ms); + pl_log_cpu_time(log, start, pl_clock_now(), "generating LUT"); + for (int j = 0; j < PL_ARRAY_SIZE(lut); j++) { + REQUIRE(isfinite(lut[j]) && !isnan(lut[j])); + if (j > 0) + REQUIRE_CMP(lut[j], >=, lut[j - 1], "f"); +#ifdef PRINT_LUTS + printf("%f, %f\n", j / (PL_ARRAY_SIZE(lut) - 1.0f), lut[j]); +#endif + } + + if (fun->map_inverse || !tested_pure_bpc++) { + start = pl_clock_now(); + pl_tone_map_generate(lut, ¶ms_inv); + pl_log_cpu_time(log, start, pl_clock_now(), "generating inverse LUT"); + for (int j = 0; j < PL_ARRAY_SIZE(lut); j++) { + REQUIRE(isfinite(lut[j]) && !isnan(lut[j])); + if (j > 0) + REQUIRE_CMP(lut[j], >=, lut[j - 1], "f"); +#ifdef PRINT_LUTS + printf("%f, %f\n", j / (PL_ARRAY_SIZE(lut) - 1.0f), lut[j]); +#endif + } + } + } + + // Test that `spline` is a no-op for 1:1 tone mapping + params.output_min = params.input_min; + params.output_max = params.input_max; + params.function = &pl_tone_map_spline; + pl_tone_map_generate(lut, ¶ms); + for (int j = 0; j < PL_ARRAY_SIZE(lut); j++) { + float x = j / (PL_ARRAY_SIZE(lut) - 1.0f); + x = PL_MIX(params.input_min, params.input_max, x); + REQUIRE_FEQ(x, lut[j], 1e-5); + } + + // Test some gamut mapping methods + for (int i = 0; i < pl_num_gamut_map_functions; i++) { + static const float min_rgb = 0.1f, max_rgb = PL_COLOR_SDR_WHITE; + struct pl_gamut_map_params gamut = { + .function = pl_gamut_map_functions[i], + .input_gamut = *pl_raw_primaries_get(PL_COLOR_PRIM_BT_2020), + .output_gamut = *pl_raw_primaries_get(PL_COLOR_PRIM_BT_709), + .min_luma = pl_hdr_rescale(PL_HDR_NITS, PL_HDR_PQ, min_rgb), + .max_luma = pl_hdr_rescale(PL_HDR_NITS, PL_HDR_PQ, max_rgb), + }; + + printf("Testing gamut-mapping function %s\n", gamut.function->name); + + // Require that black maps to black and white maps to white + float black[3] = { gamut.min_luma, 0.0f, 0.0f }; + float white[3] = { gamut.max_luma, 0.0f, 0.0f }; + pl_gamut_map_sample(black, &gamut); + pl_gamut_map_sample(white, &gamut); + REQUIRE_FEQ(black[0], gamut.min_luma, 1e-4); + REQUIRE_FEQ(black[1], 0.0f, 1e-4); + REQUIRE_FEQ(black[2], 0.0f, 1e-4); + if (gamut.function != &pl_gamut_map_darken) + REQUIRE_FEQ(white[0], gamut.max_luma, 1e-4); + REQUIRE_FEQ(white[1], 0.0f, 1e-4); 
+ REQUIRE_FEQ(white[2], 0.0f, 1e-4); + } + + enum { LUT3D_SIZE = 65 }; // for benchmarking + struct pl_gamut_map_params perceptual = { + .function = &pl_gamut_map_perceptual, + .input_gamut = *pl_raw_primaries_get(PL_COLOR_PRIM_BT_2020), + .output_gamut = *pl_raw_primaries_get(PL_COLOR_PRIM_BT_709), + .max_luma = pl_hdr_rescale(PL_HDR_NORM, PL_HDR_PQ, 1.0f), + .lut_size_I = LUT3D_SIZE, + .lut_size_C = LUT3D_SIZE, + .lut_size_h = LUT3D_SIZE, + .lut_stride = 3, + + // Set strength to maximum, because otherwise the saturation mapping + // code will not fully apply, invalidating the following test + .constants.perceptual_strength = 1.0f, + }; + + // Test that primaries round-trip for perceptual gamut mapping + const pl_matrix3x3 rgb2lms_src = pl_ipt_rgb2lms(&perceptual.input_gamut); + const pl_matrix3x3 rgb2lms_dst = pl_ipt_rgb2lms(&perceptual.output_gamut); + const pl_matrix3x3 lms2rgb_dst = pl_ipt_lms2rgb(&perceptual.output_gamut); + static const float refpoints[][3] = { + {1, 0, 0}, {0, 1, 0}, {0, 0, 1}, + {0, 1, 1}, {1, 0, 1}, {1, 1, 0}, + }; + + for (int i = 0; i < PL_ARRAY_SIZE(refpoints); i++) { + float c[3] = { refpoints[i][0], refpoints[i][1], refpoints[i][2] }; + float ref[3] = { refpoints[i][0], refpoints[i][1], refpoints[i][2] }; + printf("Testing primary: RGB {%.0f %.0f %.0f}\n", c[0], c[1], c[2]); + pl_matrix3x3_apply(&rgb2lms_src, c); + c[0] = pl_hdr_rescale(PL_HDR_NORM, PL_HDR_PQ, c[0]); + c[1] = pl_hdr_rescale(PL_HDR_NORM, PL_HDR_PQ, c[1]); + c[2] = pl_hdr_rescale(PL_HDR_NORM, PL_HDR_PQ, c[2]); + pl_matrix3x3_apply(&pl_ipt_lms2ipt, c); + printf("Before: ICh {%f %f %f}\n", + c[0], sqrtf(c[1]*c[1] + c[2]*c[2]), atan2f(c[2], c[1])); + pl_gamut_map_sample(c, &perceptual); + float rgb[3] = { c[0], c[1], c[2] }; + pl_matrix3x3_apply(&pl_ipt_ipt2lms, rgb); + rgb[0] = pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, rgb[0]); + rgb[1] = pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, rgb[1]); + rgb[2] = pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, rgb[2]); + pl_matrix3x3_apply(&lms2rgb_dst, rgb); + const float hue = atan2f(c[2], c[1]); + printf("After: ICh {%f %f %f} = RGB {%f %f %f}\n", + c[0], sqrtf(c[1]*c[1] + c[2]*c[2]), hue, rgb[0], rgb[1], rgb[2]); + pl_matrix3x3_apply(&rgb2lms_dst, ref); + ref[0] = pl_hdr_rescale(PL_HDR_NORM, PL_HDR_PQ, ref[0]); + ref[1] = pl_hdr_rescale(PL_HDR_NORM, PL_HDR_PQ, ref[1]); + ref[2] = pl_hdr_rescale(PL_HDR_NORM, PL_HDR_PQ, ref[2]); + pl_matrix3x3_apply(&pl_ipt_lms2ipt, ref); + const float hue_ref = atan2f(ref[2], ref[1]); + printf("Should be: ICh {%f %f %f}\n", + ref[0], sqrtf(ref[1]*ref[1] + ref[2]*ref[2]), hue_ref); + REQUIRE_FEQ(hue, hue_ref, 3.0e-3); + } + + float *tmp = malloc(sizeof(float[LUT3D_SIZE][LUT3D_SIZE][LUT3D_SIZE][3])); + if (tmp) { + pl_clock_t start = pl_clock_now(); + pl_gamut_map_generate(tmp, &perceptual); + pl_log_cpu_time(log, start, pl_clock_now(), "generating 3DLUT"); + free(tmp); + } + + pl_log_destroy(&log); +} diff --git a/src/tests/utils.c b/src/tests/utils.c new file mode 100644 index 0000000..73a9265 --- /dev/null +++ b/src/tests/utils.c @@ -0,0 +1,165 @@ +#include "tests.h" +#include "gpu.h" + +#include <libplacebo/utils/upload.h> + +int main() +{ + struct pl_bit_encoding bits = {0}; + struct pl_plane_data data = {0}; + + static const struct pl_bit_encoding bits0 = {0}; + static const struct pl_bit_encoding bits8 = { + .sample_depth = 8, + .color_depth = 8, + }; + + static const struct pl_bit_encoding bits16 = { + .sample_depth = 16, + .color_depth = 16, + }; + + static const struct pl_bit_encoding bits10_16 = { + .sample_depth = 16, + 
.color_depth = 10, + }; + + static const struct pl_bit_encoding bits10_16_6 = { + .sample_depth = 16, + .color_depth = 10, + .bit_shift = 6, + }; + +#define TEST_ALIGN(ref, ref_align, ref_bits, ...) \ + do { \ + pl_plane_data_from_mask(&data, (uint64_t[4]){ __VA_ARGS__ }); \ + REQUIRE_MEMEQ(&data, &ref, sizeof(ref)); \ + pl_plane_data_align(&data, &bits); \ + REQUIRE_MEMEQ(&data, &ref_align, sizeof(ref_align)); \ + REQUIRE_MEMEQ(&bits, &ref_bits, sizeof(bits)); \ + } while (0) + +#define TEST(ref, bits, ...) TEST_ALIGN(ref, ref, bits, __VA_ARGS__) + + static const struct pl_plane_data rgb8 = { + .component_size = {8, 8, 8}, + .component_map = {0, 1, 2}, + }; + + TEST(rgb8, bits8, 0xFF, 0xFF00, 0xFF0000); + + static const struct pl_plane_data bgra8 = { + .component_size = {8, 8, 8, 8}, + .component_map = {2, 1, 0, 3}, + }; + + TEST(bgra8, bits8, 0xFF0000, 0xFF00, 0xFF, 0xFF000000); + + static const struct pl_plane_data gr16 = { + .component_size = {16, 16}, + .component_map = {1, 0}, + }; + + TEST(gr16, bits16, 0xFFFF0000, 0xFFFF); + + static const struct pl_plane_data r10x6g10 = { + .component_size = {10, 10}, + .component_map = {1, 0}, // LSB -> MSB ordering + .component_pad = {0, 6}, + }; + + TEST_ALIGN(r10x6g10, gr16, bits10_16, 0x03FF0000, 0x03FF); + + static const struct pl_plane_data rgb565 = { + .component_size = {5, 6, 5}, + .component_map = {2, 1, 0}, // LSB -> MSB ordering + }; + + TEST(rgb565, bits0, 0xF800, 0x07E0, 0x001F); + + static const struct pl_plane_data rgba16 = { + .component_size = {16, 16, 16, 16}, + .component_map = {0, 1, 2, 3}, + }; + + TEST(rgba16, bits16, 0xFFFFllu, 0xFFFF0000llu, 0xFFFF00000000llu, 0xFFFF000000000000llu); + + static const struct pl_plane_data p010 = { + .component_size = {10, 10, 10}, + .component_map = {0, 1, 2}, + .component_pad = {6, 6, 6}, + }; + + static const struct pl_plane_data rgb16 = { + .component_size = {16, 16, 16}, + .component_map = {0, 1, 2}, + }; + + TEST_ALIGN(p010, rgb16, bits10_16_6, 0xFFC0llu, 0xFFC00000llu, 0xFFC000000000llu); + + // Test GLSL structure packing + struct pl_var vec1 = pl_var_float(""), + vec2 = pl_var_vec2(""), + vec3 = pl_var_vec3(""), + mat2 = pl_var_mat2(""), + mat3 = pl_var_mat3(""); + + struct pl_var_layout layout; + layout = pl_std140_layout(0, &vec2); + REQUIRE_CMP(layout.offset, ==, 0 * sizeof(float), "zu"); + REQUIRE_CMP(layout.stride, ==, 2 * sizeof(float), "zu"); + REQUIRE_CMP(layout.size, ==, 2 * sizeof(float), "zu"); + + layout = pl_std140_layout(3 * sizeof(float), &vec3); + REQUIRE_CMP(layout.offset, ==, 4 * sizeof(float), "zu"); + REQUIRE_CMP(layout.stride, ==, 3 * sizeof(float), "zu"); + REQUIRE_CMP(layout.size, ==, 3 * sizeof(float), "zu"); + + layout = pl_std140_layout(2 * sizeof(float), &mat3); + REQUIRE_CMP(layout.offset, ==, 4 * sizeof(float), "zu"); + REQUIRE_CMP(layout.stride, ==, 4 * sizeof(float), "zu"); + REQUIRE_CMP(layout.size, ==, 3 * 4 * sizeof(float), "zu"); + + layout = pl_std430_layout(2 * sizeof(float), &mat3); + REQUIRE_CMP(layout.offset, ==, 4 * sizeof(float), "zu"); + REQUIRE_CMP(layout.stride, ==, 4 * sizeof(float), "zu"); + REQUIRE_CMP(layout.size, ==, 4 * 3 * sizeof(float), "zu"); + + layout = pl_std140_layout(3 * sizeof(float), &vec1); + REQUIRE_CMP(layout.offset, ==, 3 * sizeof(float), "zu"); + REQUIRE_CMP(layout.stride, ==, sizeof(float), "zu"); + REQUIRE_CMP(layout.size, ==, sizeof(float), "zu"); + + struct pl_var vec2a = vec2; + vec2a.dim_a = 50; + + layout = pl_std140_layout(sizeof(float), &vec2a); + REQUIRE_CMP(layout.offset, ==, 4 * sizeof(float), "zu"); + 
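/*
 * Editor's note: the offsets asserted above follow from the GLSL std140
 * packing rules -- a scalar float aligns to 4 bytes, a vec2 to 8, and vec3
 * and vec4 both to 16; std140 additionally rounds array-element and
 * matrix-column strides up to 16 bytes, which std430 does not. A minimal
 * sketch of the alignment rule, independent of how pl_std140_layout() is
 * actually implemented:
 */
static size_t std140_vec_align(int components)
{
    // float and vec2 align to their own size; vec3 aligns like vec4
    return (components == 3 ? 4 : components) * sizeof(float);
}

static size_t align_up(size_t offset, size_t align)
{
    return (offset + align - 1) / align * align;
}

// Example: align_up(3 * sizeof(float), std140_vec_align(3)) == 4 * sizeof(float),
// i.e. a vec3 requested at byte offset 12 lands at byte offset 16, exactly as
// asserted above. The stricter array stride is also why a vec2[50] array gets
// a 4-float element stride under std140 but only a 2-float stride under
// std430, as checked just below.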
REQUIRE_CMP(layout.stride, ==, 4 * sizeof(float), "zu"); + REQUIRE_CMP(layout.size, ==, 50 * 4 * sizeof(float), "zu"); + + layout = pl_std430_layout(sizeof(float), &vec2a); + REQUIRE_CMP(layout.offset, ==, 2 * sizeof(float), "zu"); + REQUIRE_CMP(layout.stride, ==, 2 * sizeof(float), "zu"); + REQUIRE_CMP(layout.size, ==, 50 * 2 * sizeof(float), "zu"); + + struct pl_var mat2a = mat2; + mat2a.dim_a = 20; + + layout = pl_std140_layout(5 * sizeof(float), &mat2a); + REQUIRE_CMP(layout.offset, ==, 8 * sizeof(float), "zu"); + REQUIRE_CMP(layout.stride, ==, 4 * sizeof(float), "zu"); + REQUIRE_CMP(layout.size, ==, 20 * 2 * 4 * sizeof(float), "zu"); + + layout = pl_std430_layout(5 * sizeof(float), &mat2a); + REQUIRE_CMP(layout.offset, ==, 6 * sizeof(float), "zu"); + REQUIRE_CMP(layout.stride, ==, 2 * sizeof(float), "zu"); + REQUIRE_CMP(layout.size, ==, 20 * 2 * 2 * sizeof(float), "zu"); + + for (const struct pl_named_var *nvar = pl_var_glsl_types; nvar->glsl_name; nvar++) { + struct pl_var var = nvar->var; + REQUIRE_CMP(nvar->glsl_name, ==, pl_var_glsl_type_name(var), "s"); + var.dim_a = 100; + REQUIRE_CMP(nvar->glsl_name, ==, pl_var_glsl_type_name(var), "s"); + } +} diff --git a/src/tests/vulkan.c b/src/tests/vulkan.c new file mode 100644 index 0000000..476560a --- /dev/null +++ b/src/tests/vulkan.c @@ -0,0 +1,296 @@ +#include <vulkan/vulkan.h> + +#include "gpu_tests.h" +#include "vulkan/command.h" +#include "vulkan/gpu.h" + +#include <libplacebo/vulkan.h> + +static void vulkan_interop_tests(pl_vulkan pl_vk, + enum pl_handle_type handle_type) +{ + pl_gpu gpu = pl_vk->gpu; + printf("testing vulkan interop for handle type 0x%x\n", handle_type); + + if (gpu->export_caps.buf & handle_type) { + pl_buf buf = pl_buf_create(gpu, pl_buf_params( + .size = 1024, + .export_handle = handle_type, + )); + + REQUIRE(buf); + REQUIRE_HANDLE(buf->shared_mem, handle_type); + REQUIRE_CMP(buf->shared_mem.size, >=, buf->params.size, "zu"); + REQUIRE(pl_buf_export(gpu, buf)); + pl_buf_destroy(gpu, &buf); + } + + pl_fmt fmt = pl_find_fmt(gpu, PL_FMT_UNORM, 1, 0, 0, PL_FMT_CAP_BLITTABLE); + if (!fmt) + return; + + if (gpu->export_caps.sync & handle_type) { + pl_sync sync = pl_sync_create(gpu, handle_type); + pl_tex tex = pl_tex_create(gpu, pl_tex_params( + .w = 32, + .h = 32, + .format = fmt, + .blit_dst = true, + )); + + REQUIRE(sync); + REQUIRE(tex); + + // Note: For testing purposes, we have to fool pl_tex_export into + // thinking this texture is actually exportable. Just hack it in + // horribly. 
+ ((struct pl_tex_params *) &tex->params)->export_handle = PL_HANDLE_DMA_BUF; + + REQUIRE(pl_tex_export(gpu, tex, sync)); + + // Re-use our internal helpers to signal this VkSemaphore + struct vk_ctx *vk = PL_PRIV(pl_vk); + struct vk_cmd *cmd = vk_cmd_begin(vk->pool_graphics, NULL); + REQUIRE(cmd); + struct pl_sync_vk *sync_vk = PL_PRIV(sync); + vk_cmd_sig(cmd, VK_PIPELINE_STAGE_2_NONE, (pl_vulkan_sem){ sync_vk->signal }); + REQUIRE(vk_cmd_submit(&cmd)); + + // Do something with the image again to "import" it + pl_tex_clear(gpu, tex, (float[4]){0}); + pl_gpu_finish(gpu); + REQUIRE(!pl_tex_poll(gpu, tex, 0)); + + pl_sync_destroy(gpu, &sync); + pl_tex_destroy(gpu, &tex); + } + + // Test interop API + if (gpu->export_caps.tex & handle_type) { + VkSemaphore sem = pl_vulkan_sem_create(gpu, pl_vulkan_sem_params( + .type = VK_SEMAPHORE_TYPE_TIMELINE, + .initial_value = 0, + )); + + pl_tex tex = pl_tex_create(gpu, pl_tex_params( + .w = 32, + .h = 32, + .format = fmt, + .blit_dst = true, + .export_handle = handle_type, + )); + + REQUIRE(sem); + REQUIRE(tex); + + REQUIRE(pl_vulkan_hold_ex(gpu, pl_vulkan_hold_params( + .tex = tex, + .layout = VK_IMAGE_LAYOUT_GENERAL, + .qf = VK_QUEUE_FAMILY_EXTERNAL, + .semaphore = { sem, 1 }, + ))); + + pl_vulkan_release_ex(gpu, pl_vulkan_release_params( + .tex = tex, + .layout = VK_IMAGE_LAYOUT_GENERAL, + .qf = VK_QUEUE_FAMILY_EXTERNAL, + .semaphore = { sem, 1 }, + )); + + pl_tex_clear(gpu, tex, (float[4]){0}); + pl_gpu_finish(gpu); + REQUIRE(!pl_tex_poll(gpu, tex, 0)); + + pl_vulkan_sem_destroy(gpu, &sem); + pl_tex_destroy(gpu, &tex); + } +} + +static void vulkan_swapchain_tests(pl_vulkan vk, VkSurfaceKHR surf) +{ + if (!surf) + return; + + printf("testing vulkan swapchain\n"); + pl_gpu gpu = vk->gpu; + pl_swapchain sw; + sw = pl_vulkan_create_swapchain(vk, pl_vulkan_swapchain_params( + .surface = surf, + )); + REQUIRE(sw); + + // Attempt actually initializing the swapchain + int w = 640, h = 480; + REQUIRE(pl_swapchain_resize(sw, &w, &h)); + + for (int i = 0; i < 10; i++) { + struct pl_swapchain_frame frame; + REQUIRE(pl_swapchain_start_frame(sw, &frame)); + if (frame.fbo->params.blit_dst) + pl_tex_clear(gpu, frame.fbo, (float[4]){0}); + + // TODO: test this with an actual pl_renderer instance + struct pl_frame target; + pl_frame_from_swapchain(&target, &frame); + + REQUIRE(pl_swapchain_submit_frame(sw)); + pl_swapchain_swap_buffers(sw); + + // Try resizing the swapchain in the middle of rendering + if (i == 5) { + w = 320; + h = 240; + REQUIRE(pl_swapchain_resize(sw, &w, &h)); + } + } + + pl_swapchain_destroy(&sw); +} + +int main() +{ + pl_log log = pl_test_logger(); + pl_vk_inst inst = pl_vk_inst_create(log, pl_vk_inst_params( + .debug = true, + .debug_extra = true, + .get_proc_addr = vkGetInstanceProcAddr, + .opt_extensions = (const char *[]){ + VK_KHR_SURFACE_EXTENSION_NAME, + VK_EXT_HEADLESS_SURFACE_EXTENSION_NAME, + }, + .num_opt_extensions = 2, + )); + + if (!inst) + return SKIP; + + PL_VK_LOAD_FUN(inst->instance, EnumeratePhysicalDevices, inst->get_proc_addr); + PL_VK_LOAD_FUN(inst->instance, GetPhysicalDeviceProperties, inst->get_proc_addr); + + uint32_t num = 0; + EnumeratePhysicalDevices(inst->instance, &num, NULL); + if (!num) + return SKIP; + + VkPhysicalDevice *devices = calloc(num, sizeof(*devices)); + if (!devices) + return 1; + EnumeratePhysicalDevices(inst->instance, &num, devices); + + VkSurfaceKHR surf = VK_NULL_HANDLE; + + PL_VK_LOAD_FUN(inst->instance, CreateHeadlessSurfaceEXT, inst->get_proc_addr); + if (CreateHeadlessSurfaceEXT) { + 
VkHeadlessSurfaceCreateInfoEXT info = { + .sType = VK_STRUCTURE_TYPE_HEADLESS_SURFACE_CREATE_INFO_EXT, + }; + + VkResult res = CreateHeadlessSurfaceEXT(inst->instance, &info, NULL, &surf); + REQUIRE_CMP(res, ==, VK_SUCCESS, "u"); + } + + // Make sure choosing any device works + VkPhysicalDevice dev; + dev = pl_vulkan_choose_device(log, pl_vulkan_device_params( + .instance = inst->instance, + .get_proc_addr = inst->get_proc_addr, + .allow_software = true, + .surface = surf, + )); + if (!dev) + return SKIP; + + // Test all attached devices + for (int i = 0; i < num; i++) { + VkPhysicalDeviceProperties props = {0}; + GetPhysicalDeviceProperties(devices[i], &props); +#ifndef CI_ALLOW_SW + if (props.deviceType == VK_PHYSICAL_DEVICE_TYPE_CPU) { + printf("Skipping device %d: %s\n", i, props.deviceName); + continue; + } +#endif + printf("Testing device %d: %s\n", i, props.deviceName); + + // Make sure we can choose this device by name + dev = pl_vulkan_choose_device(log, pl_vulkan_device_params( + .instance = inst->instance, + .get_proc_addr = inst->get_proc_addr, + .device_name = props.deviceName, + )); + REQUIRE_CMP(dev, ==, devices[i], "p"); + + struct pl_vulkan_params params = *pl_vulkan_params( + .instance = inst->instance, + .get_proc_addr = inst->get_proc_addr, + .device = devices[i], + .queue_count = 8, // test inter-queue stuff + .surface = surf, + ); + + pl_vulkan vk = pl_vulkan_create(log, &params); + if (!vk) + continue; + + gpu_shader_tests(vk->gpu); + vulkan_swapchain_tests(vk, surf); + + // Print heap statistics + pl_vk_print_heap(vk->gpu, PL_LOG_DEBUG); + + // Test importing this context via the vulkan interop API + pl_vulkan vk2 = pl_vulkan_import(log, pl_vulkan_import_params( + .instance = vk->instance, + .get_proc_addr = inst->get_proc_addr, + .phys_device = vk->phys_device, + .device = vk->device, + + .extensions = vk->extensions, + .num_extensions = vk->num_extensions, + .features = vk->features, + .queue_graphics = vk->queue_graphics, + .queue_compute = vk->queue_compute, + .queue_transfer = vk->queue_transfer, + )); + REQUIRE(vk2); + pl_vulkan_destroy(&vk2); + + // Run these tests last because they disable some validation layers +#ifdef PL_HAVE_UNIX + vulkan_interop_tests(vk, PL_HANDLE_FD); + vulkan_interop_tests(vk, PL_HANDLE_DMA_BUF); +#endif +#ifdef PL_HAVE_WIN32 + vulkan_interop_tests(vk, PL_HANDLE_WIN32); + vulkan_interop_tests(vk, PL_HANDLE_WIN32_KMT); +#endif + gpu_interop_tests(vk->gpu); + pl_vulkan_destroy(&vk); + + // Re-run the same export/import tests with async queues disabled + params.async_compute = false; + params.async_transfer = false; + vk = pl_vulkan_create(log, &params); + REQUIRE(vk); // it succeeded the first time + +#ifdef PL_HAVE_UNIX + vulkan_interop_tests(vk, PL_HANDLE_FD); + vulkan_interop_tests(vk, PL_HANDLE_DMA_BUF); +#endif +#ifdef PL_HAVE_WIN32 + vulkan_interop_tests(vk, PL_HANDLE_WIN32); + vulkan_interop_tests(vk, PL_HANDLE_WIN32_KMT); +#endif + gpu_interop_tests(vk->gpu); + pl_vulkan_destroy(&vk); + + // Reduce log spam after first tested device + pl_log_level_update(log, PL_LOG_INFO); + } + + if (surf) + vkDestroySurfaceKHR(inst->instance, surf, NULL); + pl_vk_inst_destroy(&inst); + pl_log_destroy(&log); + free(devices); +} diff --git a/src/tone_mapping.c b/src/tone_mapping.c new file mode 100644 index 0000000..f08bb58 --- /dev/null +++ b/src/tone_mapping.c @@ -0,0 +1,775 @@ +/* + * This file is part of libplacebo.
+ * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <math.h> + +#include "common.h" + +#include <libplacebo/tone_mapping.h> + +#define fclampf(x, lo, hi) fminf(fmaxf(x, lo), hi) +static void fix_constants(struct pl_tone_map_constants *c) +{ + const float eps = 1e-6f; + c->knee_adaptation = fclampf(c->knee_adaptation, 0.0f, 1.0f); + c->knee_minimum = fclampf(c->knee_minimum, eps, 0.5f - eps); + c->knee_maximum = fclampf(c->knee_maximum, 0.5f + eps, 1.0f - eps); + c->knee_default = fclampf(c->knee_default, c->knee_minimum, c->knee_maximum); + c->knee_offset = fclampf(c->knee_offset, 0.5f, 2.0f); + c->slope_tuning = fclampf(c->slope_tuning, 0.0f, 10.0f); + c->slope_offset = fclampf(c->slope_offset, 0.0f, 1.0f); + c->spline_contrast = fclampf(c->spline_contrast, 0.0f, 1.5f); + c->reinhard_contrast = fclampf(c->reinhard_contrast, eps, 1.0f - eps); + c->linear_knee = fclampf(c->linear_knee, eps, 1.0f - eps); + c->exposure = fclampf(c->exposure, eps, 10.0f); +} + +static inline bool constants_equal(const struct pl_tone_map_constants *a, + const struct pl_tone_map_constants *b) +{ + pl_static_assert(sizeof(*a) % sizeof(float) == 0); + return !memcmp(a, b, sizeof(*a)); +} + +bool pl_tone_map_params_equal(const struct pl_tone_map_params *a, + const struct pl_tone_map_params *b) +{ + return a->function == b->function && + a->param == b->param && + a->input_scaling == b->input_scaling && + a->output_scaling == b->output_scaling && + a->lut_size == b->lut_size && + a->input_min == b->input_min && + a->input_max == b->input_max && + a->input_avg == b->input_avg && + a->output_min == b->output_min && + a->output_max == b->output_max && + constants_equal(&a->constants, &b->constants) && + pl_hdr_metadata_equal(&a->hdr, &b->hdr); +} + +bool pl_tone_map_params_noop(const struct pl_tone_map_params *p) +{ + float in_min = pl_hdr_rescale(p->input_scaling, PL_HDR_NITS, p->input_min); + float in_max = pl_hdr_rescale(p->input_scaling, PL_HDR_NITS, p->input_max); + float out_min = pl_hdr_rescale(p->output_scaling, PL_HDR_NITS, p->output_min); + float out_max = pl_hdr_rescale(p->output_scaling, PL_HDR_NITS, p->output_max); + bool can_inverse = p->function->map_inverse; + + return fabs(in_min - out_min) < 1e-4 && // no BPC + in_max < out_max + 1e-2 && // no range reduction + (out_max < in_max + 1e-2 || !can_inverse); // no inverse tone-mapping +} + +void pl_tone_map_params_infer(struct pl_tone_map_params *par) +{ + if (!par->function) + par->function = &pl_tone_map_clip; + + if (par->param) { + // Backwards compatibility for older API + if (par->function == &pl_tone_map_st2094_40 || par->function == &pl_tone_map_st2094_10) + par->constants.knee_adaptation = par->param; + if (par->function == &pl_tone_map_bt2390) + par->constants.knee_offset = par->param; + if (par->function == &pl_tone_map_spline) + par->constants.spline_contrast = par->param; + if (par->function == 
&pl_tone_map_reinhard) + par->constants.reinhard_contrast = par->param; + if (par->function == &pl_tone_map_mobius || par->function == &pl_tone_map_gamma) + par->constants.linear_knee = par->param; + if (par->function == &pl_tone_map_linear || par->function == &pl_tone_map_linear_light) + par->constants.exposure = par->param; + } + + fix_constants(&par->constants); + + // Constrain the input peak to be no less than target SDR white + float sdr = pl_hdr_rescale(par->output_scaling, par->input_scaling, par->output_max); + sdr = fminf(sdr, pl_hdr_rescale(PL_HDR_NITS, par->input_scaling, PL_COLOR_SDR_WHITE)); + par->input_max = fmaxf(par->input_max, sdr); + + // Constrain the output peak if function does not support inverse mapping + if (!par->function->map_inverse) + par->output_max = fminf(par->output_max, par->input_max); +} + +// Infer params and rescale to function scaling +static struct pl_tone_map_params fix_params(const struct pl_tone_map_params *params) +{ + struct pl_tone_map_params fixed = *params; + pl_tone_map_params_infer(&fixed); + + const struct pl_tone_map_function *fun = params->function; + fixed.input_scaling = fun->scaling; + fixed.output_scaling = fun->scaling; + fixed.input_min = pl_hdr_rescale(params->input_scaling, fun->scaling, fixed.input_min); + fixed.input_max = pl_hdr_rescale(params->input_scaling, fun->scaling, fixed.input_max); + fixed.input_avg = pl_hdr_rescale(params->input_scaling, fun->scaling, fixed.input_avg); + fixed.output_min = pl_hdr_rescale(params->output_scaling, fun->scaling, fixed.output_min); + fixed.output_max = pl_hdr_rescale(params->output_scaling, fun->scaling, fixed.output_max); + + return fixed; +} + +#define FOREACH_LUT(lut, V) \ + for (float *_iter = lut, *_end = lut + params->lut_size, V; \ + _iter < _end && ( V = *_iter, 1 ); *_iter++ = V) + +static void map_lut(float *lut, const struct pl_tone_map_params *params) +{ + if (params->output_max > params->input_max + 1e-4) { + // Inverse tone-mapping + pl_assert(params->function->map_inverse); + params->function->map_inverse(lut, params); + } else { + // Forward tone-mapping + params->function->map(lut, params); + } +} + +void pl_tone_map_generate(float *out, const struct pl_tone_map_params *params) +{ + struct pl_tone_map_params fixed = fix_params(params); + + // Generate input values evenly spaced in `params->input_scaling` + for (size_t i = 0; i < params->lut_size; i++) { + float x = (float) i / (params->lut_size - 1); + x = PL_MIX(params->input_min, params->input_max, x); + out[i] = pl_hdr_rescale(params->input_scaling, fixed.function->scaling, x); + } + + map_lut(out, &fixed); + + // Sanitize outputs and adapt back to `params->scaling` + for (size_t i = 0; i < params->lut_size; i++) { + float x = PL_CLAMP(out[i], fixed.output_min, fixed.output_max); + out[i] = pl_hdr_rescale(fixed.function->scaling, params->output_scaling, x); + } +} + +float pl_tone_map_sample(float x, const struct pl_tone_map_params *params) +{ + struct pl_tone_map_params fixed = fix_params(params); + fixed.lut_size = 1; + + x = PL_CLAMP(x, params->input_min, params->input_max); + x = pl_hdr_rescale(params->input_scaling, fixed.function->scaling, x); + map_lut(&x, &fixed); + x = PL_CLAMP(x, fixed.output_min, fixed.output_max); + x = pl_hdr_rescale(fixed.function->scaling, params->output_scaling, x); + return x; +} + +// Rescale from input-absolute to input-relative +static inline float rescale_in(float x, const struct pl_tone_map_params *params) +{ + return (x - params->input_min) / (params->input_max - 
params->input_min); +} + +// Rescale from input-absolute to output-relative +static inline float rescale(float x, const struct pl_tone_map_params *params) +{ + return (x - params->input_min) / (params->output_max - params->output_min); +} + +// Rescale from output-relative to output-absolute +static inline float rescale_out(float x, const struct pl_tone_map_params *params) +{ + return x * (params->output_max - params->output_min) + params->output_min; +} + +static inline float bt1886_eotf(float x, float min, float max) +{ + const float lb = powf(min, 1/2.4f); + const float lw = powf(max, 1/2.4f); + return powf((lw - lb) * x + lb, 2.4f); +} + +static inline float bt1886_oetf(float x, float min, float max) +{ + const float lb = powf(min, 1/2.4f); + const float lw = powf(max, 1/2.4f); + return (powf(x, 1/2.4f) - lb) / (lw - lb); +} + +static void noop(float *lut, const struct pl_tone_map_params *params) +{ + return; +} + +const struct pl_tone_map_function pl_tone_map_clip = { + .name = "clip", + .description = "No tone mapping (clip)", + .map = noop, + .map_inverse = noop, +}; + +// Helper function to pick a knee point (for suitable methods) based on the +// HDR10+ brightness metadata and scene brightness average matching. +// +// Inspired by SMPTE ST2094-10, with some modifications +static void st2094_pick_knee(float *out_src_knee, float *out_dst_knee, + const struct pl_tone_map_params *params) +{ + const float src_min = pl_hdr_rescale(params->input_scaling, PL_HDR_PQ, params->input_min); + const float src_max = pl_hdr_rescale(params->input_scaling, PL_HDR_PQ, params->input_max); + const float src_avg = pl_hdr_rescale(params->input_scaling, PL_HDR_PQ, params->input_avg); + const float dst_min = pl_hdr_rescale(params->output_scaling, PL_HDR_PQ, params->output_min); + const float dst_max = pl_hdr_rescale(params->output_scaling, PL_HDR_PQ, params->output_max); + + const float min_knee = params->constants.knee_minimum; + const float max_knee = params->constants.knee_maximum; + const float def_knee = params->constants.knee_default; + const float src_knee_min = PL_MIX(src_min, src_max, min_knee); + const float src_knee_max = PL_MIX(src_min, src_max, max_knee); + const float dst_knee_min = PL_MIX(dst_min, dst_max, min_knee); + const float dst_knee_max = PL_MIX(dst_min, dst_max, max_knee); + + // Choose source knee based on source scene brightness + float src_knee = PL_DEF(src_avg, PL_MIX(src_min, src_max, def_knee)); + src_knee = fclampf(src_knee, src_knee_min, src_knee_max); + + // Choose target adaptation point based on linearly re-scaling source knee + float target = (src_knee - src_min) / (src_max - src_min); + float adapted = PL_MIX(dst_min, dst_max, target); + + // Choose the destnation knee by picking the perceptual adaptation point + // between the source knee and the desired target. This moves the knee + // point, on the vertical axis, closer to the 1:1 (neutral) line. 
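/*
 * Editor's worked example for the knee choice above (numbers are purely
 * illustrative and ignore the extra tuning applied below): with a source
 * range of [0.0, 0.8] in PQ, a measured scene average of 0.5 and a target
 * range of [0.0, 0.6], the relative source knee is 0.5 / 0.8 = 0.625, the
 * linearly re-scaled target is 0.625 * 0.6 = 0.375, and with a hypothetical
 * knee_adaptation of 0.4 the destination knee lands at
 * mix(0.5, 0.375, 0.4) = 0.45 -- closer to the neutral 1:1 value of 0.5 than
 * the fully adapted 0.375.
 */
static float example_dst_knee(void)
{
    const float src_min = 0.0f, src_max = 0.8f, src_avg = 0.5f; // PQ
    const float dst_min = 0.0f, dst_max = 0.6f;                 // PQ
    const float adaptation = 0.4f; // hypothetical knee_adaptation constant
    float src_knee = src_avg; // assuming it is not clamped by min/max knee
    float target   = (src_knee - src_min) / (src_max - src_min); // 0.625
    float adapted  = dst_min + target * (dst_max - dst_min);     // 0.375
    return src_knee + adaptation * (adapted - src_knee);         // 0.45
}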
+ // + // Adjust the adaptation strength towards 1 based on how close the knee + // point is to its extreme values (min/max knee) + float tuning = 1.0f - pl_smoothstep(max_knee, def_knee, target) * + pl_smoothstep(min_knee, def_knee, target); + float adaptation = PL_MIX(params->constants.knee_adaptation, 1.0f, tuning); + float dst_knee = PL_MIX(src_knee, adapted, adaptation); + dst_knee = fclampf(dst_knee, dst_knee_min, dst_knee_max); + + *out_src_knee = pl_hdr_rescale(PL_HDR_PQ, params->input_scaling, src_knee); + *out_dst_knee = pl_hdr_rescale(PL_HDR_PQ, params->output_scaling, dst_knee); +} + +// Pascal's triangle +static const uint16_t binom[17][17] = { + {1}, + {1,1}, + {1,2,1}, + {1,3,3,1}, + {1,4,6,4,1}, + {1,5,10,10,5,1}, + {1,6,15,20,15,6,1}, + {1,7,21,35,35,21,7,1}, + {1,8,28,56,70,56,28,8,1}, + {1,9,36,84,126,126,84,36,9,1}, + {1,10,45,120,210,252,210,120,45,10,1}, + {1,11,55,165,330,462,462,330,165,55,11,1}, + {1,12,66,220,495,792,924,792,495,220,66,12,1}, + {1,13,78,286,715,1287,1716,1716,1287,715,286,78,13,1}, + {1,14,91,364,1001,2002,3003,3432,3003,2002,1001,364,91,14,1}, + {1,15,105,455,1365,3003,5005,6435,6435,5005,3003,1365,455,105,15,1}, + {1,16,120,560,1820,4368,8008,11440,12870,11440,8008,4368,1820,560,120,16,1}, +}; + +static inline float st2094_intercept(uint8_t N, float Kx, float Ky) +{ + if (Kx <= 0 || Ky >= 1) + return 1.0f / N; + + const float slope = Ky / Kx * (1 - Kx) / (1 - Ky); + return fminf(slope / N, 1.0f); +} + +static void st2094_40(float *lut, const struct pl_tone_map_params *params) +{ + const float D = params->output_max; + + // Allocate space for the adjusted bezier control points, plus endpoints + float P[17], Kx, Ky, T; + uint8_t N; + + if (params->hdr.ootf.num_anchors) { + + // Use bezier curve from metadata + Kx = PL_CLAMP(params->hdr.ootf.knee_x, 0, 1); + Ky = PL_CLAMP(params->hdr.ootf.knee_y, 0, 1); + T = PL_CLAMP(params->hdr.ootf.target_luma, params->input_min, params->input_max); + N = params->hdr.ootf.num_anchors + 1; + pl_assert(N < PL_ARRAY_SIZE(P)); + memcpy(P + 1, params->hdr.ootf.anchors, (N - 1) * sizeof(*P)); + P[0] = 0.0f; + P[N] = 1.0f; + + } else { + + // Missing metadata, default to simple brightness matching + float src_knee, dst_knee; + st2094_pick_knee(&src_knee, &dst_knee, params); + Kx = src_knee / params->input_max; + Ky = dst_knee / params->output_max; + + // Solve spline to match slope at knee intercept + const float slope = Ky / Kx * (1 - Kx) / (1 - Ky); + N = PL_CLAMP((int) ceilf(slope), 2, PL_ARRAY_SIZE(P) - 1); + P[0] = 0.0f; + P[1] = st2094_intercept(N, Kx, Ky); + for (int i = 2; i <= N; i++) + P[i] = 1.0f; + T = D; + + } + + if (D < T) { + + // Output display darker than OOTF target, make brighter + const float Dmin = 0.0f, u = fmaxf(0.0f, (D - Dmin) / (T - Dmin)); + + // Scale down the knee point to make more room for the OOTF + Kx *= u; + Ky *= u; + + // Make the slope of the knee more closely approximate a clip(), + // constrained to avoid exploding P[1] + const float beta = N * Kx / (1 - Kx); + const float Kxy = fminf(Kx * params->input_max / D, beta / (beta + 1)); + Ky = PL_MIX(Kxy, Ky, u); + + for (int p = 2; p <= N; p++) + P[p] = PL_MIX(1.0f, P[p], u); + + // Make the OOTF intercept linear as D -> Dmin + P[1] = PL_MIX(st2094_intercept(N, Kx, Ky), P[1], u); + + } else if (D > T) { + + // Output display brighter than OOTF target, make more linear + pl_assert(params->input_max > T); + const float w = powf(1 - (D - T) / (params->input_max - T), 1.4f); + + // Constrain the slope of the input knee to prevent it from 
+ // exploding and making the picture way too bright + Ky *= T / D; + + // Make the slope of the knee more linear by solving for f(Kx) = Kx + float Kxy = Kx * D / params->input_max; + Ky = PL_MIX(Kxy, Ky, w); + + for (int p = 2; p < N; p++) { + float anchor_lin = (float) p / N; + P[p] = PL_MIX(anchor_lin, P[p], w); + } + + // Make the OOTF intercept linear as D -> input_max + P[1] = PL_MIX(st2094_intercept(N, Kx, Ky), P[1], w); + + } + + pl_assert(Kx >= 0 && Kx <= 1); + pl_assert(Ky >= 0 && Ky <= 1); + + FOREACH_LUT(lut, x) { + x = bt1886_oetf(x, params->input_min, params->input_max); + x = bt1886_eotf(x, 0.0f, 1.0f); + + if (x <= Kx && Kx) { + // Linear section + x *= Ky / Kx; + } else { + // Bezier section + const float t = (x - Kx) / (1 - Kx); + + x = 0; // Bn + for (uint8_t p = 0; p <= N; p++) + x += binom[N][p] * powf(t, p) * powf(1 - t, N - p) * P[p]; + + x = Ky + (1 - Ky) * x; + } + + x = bt1886_oetf(x, 0.0f, 1.0f); + x = bt1886_eotf(x, params->output_min, params->output_max); + } +} + +const struct pl_tone_map_function pl_tone_map_st2094_40 = { + .name = "st2094-40", + .description = "SMPTE ST 2094-40 Annex B", + .param_desc = "Knee point target", + .param_min = 0.00f, + .param_def = 0.70f, + .param_max = 1.00f, + .scaling = PL_HDR_NITS, + .map = st2094_40, +}; + +static void st2094_10(float *lut, const struct pl_tone_map_params *params) +{ + float src_knee, dst_knee; + st2094_pick_knee(&src_knee, &dst_knee, params); + + const float x1 = params->input_min; + const float x3 = params->input_max; + const float x2 = src_knee; + + const float y1 = params->output_min; + const float y3 = params->output_max; + const float y2 = dst_knee; + + const pl_matrix3x3 cmat = {{ + { x2*x3*(y2 - y3), x1*x3*(y3 - y1), x1*x2*(y1 - y2) }, + { x3*y3 - x2*y2, x1*y1 - x3*y3, x2*y2 - x1*y1 }, + { x3 - x2, x1 - x3, x2 - x1 }, + }}; + + float coeffs[3] = { y1, y2, y3 }; + pl_matrix3x3_apply(&cmat, coeffs); + + const float k = 1.0 / (x3*y3*(x1 - x2) + x2*y2*(x3 - x1) + x1*y1*(x2 - x3)); + const float c1 = k * coeffs[0]; + const float c2 = k * coeffs[1]; + const float c3 = k * coeffs[2]; + + FOREACH_LUT(lut, x) + x = (c1 + c2 * x) / (1 + c3 * x); +} + +const struct pl_tone_map_function pl_tone_map_st2094_10 = { + .name = "st2094-10", + .description = "SMPTE ST 2094-10 Annex B.2", + .param_desc = "Knee point target", + .param_min = 0.00f, + .param_def = 0.70f, + .param_max = 1.00f, + .scaling = PL_HDR_NITS, + .map = st2094_10, +}; + +static void bt2390(float *lut, const struct pl_tone_map_params *params) +{ + const float minLum = rescale_in(params->output_min, params); + const float maxLum = rescale_in(params->output_max, params); + const float offset = params->constants.knee_offset; + const float ks = (1 + offset) * maxLum - offset; + const float bp = minLum > 0 ? fminf(1 / minLum, 4) : 4; + const float gain_inv = 1 + minLum / maxLum * powf(1 - maxLum, bp); + const float gain = maxLum < 1 ? 1 / gain_inv : 1; + + FOREACH_LUT(lut, x) { + x = rescale_in(x, params); + + // Piece-wise hermite spline + if (ks < 1) { + float tb = (x - ks) / (1 - ks); + float tb2 = tb * tb; + float tb3 = tb2 * tb; + float pb = (2 * tb3 - 3 * tb2 + 1) * ks + + (tb3 - 2 * tb2 + tb) * (1 - ks) + + (-2 * tb3 + 3 * tb2) * maxLum; + x = x < ks ? 
x : pb; + } + + // Black point adaptation + if (x < 1) { + x += minLum * powf(1 - x, bp); + x = gain * (x - minLum) + minLum; + } + + x = x * (params->input_max - params->input_min) + params->input_min; + } +} + +const struct pl_tone_map_function pl_tone_map_bt2390 = { + .name = "bt2390", + .description = "ITU-R BT.2390 EETF", + .scaling = PL_HDR_PQ, + .param_desc = "Knee offset", + .param_min = 0.50, + .param_def = 1.00, + .param_max = 2.00, + .map = bt2390, +}; + +static void bt2446a(float *lut, const struct pl_tone_map_params *params) +{ + const float phdr = 1 + 32 * powf(params->input_max / 10000, 1/2.4f); + const float psdr = 1 + 32 * powf(params->output_max / 10000, 1/2.4f); + + FOREACH_LUT(lut, x) { + x = powf(rescale_in(x, params), 1/2.4f); + x = logf(1 + (phdr - 1) * x) / logf(phdr); + + if (x <= 0.7399f) { + x = 1.0770f * x; + } else if (x < 0.9909f) { + x = (-1.1510f * x + 2.7811f) * x - 0.6302f; + } else { + x = 0.5f * x + 0.5f; + } + + x = (powf(psdr, x) - 1) / (psdr - 1); + x = bt1886_eotf(x, params->output_min, params->output_max); + } +} + +static void bt2446a_inv(float *lut, const struct pl_tone_map_params *params) +{ + FOREACH_LUT(lut, x) { + x = bt1886_oetf(x, params->input_min, params->input_max); + x *= 255.0; + if (x > 70) { + x = powf(x, (2.8305e-6f * x - 7.4622e-4f) * x + 1.2528f); + } else { + x = powf(x, (1.8712e-5f * x - 2.7334e-3f) * x + 1.3141f); + } + x = powf(x / 1000, 2.4f); + x = rescale_out(x, params); + } +} + +const struct pl_tone_map_function pl_tone_map_bt2446a = { + .name = "bt2446a", + .description = "ITU-R BT.2446 Method A", + .scaling = PL_HDR_NITS, + .map = bt2446a, + .map_inverse = bt2446a_inv, +}; + +static void spline(float *lut, const struct pl_tone_map_params *params) +{ + float src_pivot, dst_pivot; + st2094_pick_knee(&src_pivot, &dst_pivot, params); + + // Solve for linear knee (Pa = 0) + float slope = (dst_pivot - params->output_min) / + (src_pivot - params->input_min); + + // Tune the slope at the knee point slightly: raise it to a user-provided + // gamma exponent, multiplied by an extra tuning coefficient designed to + // make the slope closer to 1.0 when the difference in peaks is low, and + // closer to linear when the difference between peaks is high. + float ratio = params->input_max / params->output_max - 1.0f; + ratio = fclampf(params->constants.slope_tuning * ratio, + params->constants.slope_offset, + 1.0f + params->constants.slope_offset); + slope = powf(slope, (1.0f - params->constants.spline_contrast) * ratio); + + // Normalize everything the pivot to make the math easier + const float in_min = params->input_min - src_pivot; + const float in_max = params->input_max - src_pivot; + const float out_min = params->output_min - dst_pivot; + const float out_max = params->output_max - dst_pivot; + + // Solve P of order 2 for: + // P(in_min) = out_min + // P'(0.0) = slope + // P(0.0) = 0.0 + const float Pa = (out_min - slope * in_min) / (in_min * in_min); + const float Pb = slope; + + // Solve Q of order 3 for: + // Q(in_max) = out_max + // Q''(in_max) = 0.0 + // Q(0.0) = 0.0 + // Q'(0.0) = slope + const float t = 2 * in_max * in_max; + const float Qa = (slope * in_max - out_max) / (in_max * t); + const float Qb = -3 * (slope * in_max - out_max) / t; + const float Qc = slope; + + FOREACH_LUT(lut, x) { + x -= src_pivot; + x = x > 0 ? 
((Qa * x + Qb) * x + Qc) * x : (Pa * x + Pb) * x; + x += dst_pivot; + } +} + +const struct pl_tone_map_function pl_tone_map_spline = { + .name = "spline", + .description = "Single-pivot polynomial spline", + .param_desc = "Contrast", + .param_min = 0.00f, + .param_def = 0.50f, + .param_max = 1.50f, + .scaling = PL_HDR_PQ, + .map = spline, + .map_inverse = spline, +}; + +static void reinhard(float *lut, const struct pl_tone_map_params *params) +{ + const float peak = rescale(params->input_max, params), + contrast = params->constants.reinhard_contrast, + offset = (1.0 - contrast) / contrast, + scale = (peak + offset) / peak; + + FOREACH_LUT(lut, x) { + x = rescale(x, params); + x = x / (x + offset); + x *= scale; + x = rescale_out(x, params); + } +} + +const struct pl_tone_map_function pl_tone_map_reinhard = { + .name = "reinhard", + .description = "Reinhard", + .param_desc = "Contrast", + .param_min = 0.001, + .param_def = 0.50, + .param_max = 0.99, + .map = reinhard, +}; + +static void mobius(float *lut, const struct pl_tone_map_params *params) +{ + const float peak = rescale(params->input_max, params), + j = params->constants.linear_knee; + + // Solve for M(j) = j; M(peak) = 1.0; M'(j) = 1.0 + // where M(x) = scale * (x+a)/(x+b) + const float a = -j*j * (peak - 1.0f) / (j*j - 2.0f * j + peak); + const float b = (j*j - 2.0f * j * peak + peak) / + fmaxf(1e-6f, peak - 1.0f); + const float scale = (b*b + 2.0f * b*j + j*j) / (b - a); + + FOREACH_LUT(lut, x) { + x = rescale(x, params); + x = x <= j ? x : scale * (x + a) / (x + b); + x = rescale_out(x, params); + } +} + +const struct pl_tone_map_function pl_tone_map_mobius = { + .name = "mobius", + .description = "Mobius", + .param_desc = "Knee point", + .param_min = 0.00, + .param_def = 0.30, + .param_max = 0.99, + .map = mobius, +}; + +static inline float hable(float x) +{ + const float A = 0.15, B = 0.50, C = 0.10, D = 0.20, E = 0.02, F = 0.30; + return ((x * (A*x + C*B) + D*E) / (x * (A*x + B) + D*F)) - E/F; +} + +static void hable_map(float *lut, const struct pl_tone_map_params *params) +{ + const float peak = params->input_max / params->output_max, + scale = 1.0f / hable(peak); + + FOREACH_LUT(lut, x) { + x = bt1886_oetf(x, params->input_min, params->input_max); + x = bt1886_eotf(x, 0, peak); + x = scale * hable(x); + x = bt1886_oetf(x, 0, 1); + x = bt1886_eotf(x, params->output_min, params->output_max); + } +} + +const struct pl_tone_map_function pl_tone_map_hable = { + .name = "hable", + .description = "Filmic tone-mapping (Hable)", + .map = hable_map, +}; + +static void gamma_map(float *lut, const struct pl_tone_map_params *params) +{ + const float peak = rescale(params->input_max, params), + cutoff = params->constants.linear_knee, + gamma = logf(cutoff) / logf(cutoff / peak); + + FOREACH_LUT(lut, x) { + x = rescale(x, params); + x = x > cutoff ? 
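/*
 * Editor's note on the reinhard() curve above: with offset = (1 - contrast) /
 * contrast and scale = (peak + offset) / peak, the curve
 * f(x) = scale * x / (x + offset) satisfies f(0) = 0 and f(peak) = 1 exactly,
 * so after rescale_out() the endpoints map to output_min and output_max.
 * A standalone restatement (names are local to this sketch):
 */
static float reinhard_curve(float x, float peak, float contrast)
{
    const float offset = (1.0f - contrast) / contrast;
    const float scale  = (peak + offset) / peak;
    return scale * x / (x + offset);
}

// reinhard_curve(peak, peak, c) == (peak + offset) / peak * peak / (peak + offset)
// == 1 for any peak > 0 and 0 < c < 1, matching the scaling used above.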
powf(x / peak, gamma) : x; + x = rescale_out(x, params); + } +} + +const struct pl_tone_map_function pl_tone_map_gamma = { + .name = "gamma", + .description = "Gamma function with knee", + .param_desc = "Knee point", + .param_min = 0.001, + .param_def = 0.30, + .param_max = 1.00, + .map = gamma_map, +}; + +static void linear(float *lut, const struct pl_tone_map_params *params) +{ + const float gain = params->constants.exposure; + + FOREACH_LUT(lut, x) { + x = rescale_in(x, params); + x *= gain; + x = rescale_out(x, params); + } +} + +const struct pl_tone_map_function pl_tone_map_linear = { + .name = "linear", + .description = "Perceptually linear stretch", + .param_desc = "Exposure", + .param_min = 0.001, + .param_def = 1.00, + .param_max = 10.0, + .scaling = PL_HDR_PQ, + .map = linear, + .map_inverse = linear, +}; + +const struct pl_tone_map_function pl_tone_map_linear_light = { + .name = "linearlight", + .description = "Linear light stretch", + .param_desc = "Exposure", + .param_min = 0.001, + .param_def = 1.00, + .param_max = 10.0, + .scaling = PL_HDR_NORM, + .map = linear, + .map_inverse = linear, +}; + +const struct pl_tone_map_function * const pl_tone_map_functions[] = { + &pl_tone_map_clip, + &pl_tone_map_st2094_40, + &pl_tone_map_st2094_10, + &pl_tone_map_bt2390, + &pl_tone_map_bt2446a, + &pl_tone_map_spline, + &pl_tone_map_reinhard, + &pl_tone_map_mobius, + &pl_tone_map_hable, + &pl_tone_map_gamma, + &pl_tone_map_linear, + &pl_tone_map_linear_light, + NULL +}; + +const int pl_num_tone_map_functions = PL_ARRAY_SIZE(pl_tone_map_functions) - 1; + +const struct pl_tone_map_function *pl_find_tone_map_function(const char *name) +{ + for (int i = 0; i < pl_num_tone_map_functions; i++) { + if (strcmp(name, pl_tone_map_functions[i]->name) == 0) + return pl_tone_map_functions[i]; + } + + return NULL; +} diff --git a/src/ucrt_math.def b/src/ucrt_math.def new file mode 100644 index 0000000..f7d000d --- /dev/null +++ b/src/ucrt_math.def @@ -0,0 +1,292 @@ +LIBRARY api-ms-win-crt-math-l1-1-0 +EXPORTS +_Cbuild +_Cmulcc +_Cmulcr +_FCbuild +_FCmulcc +_FCmulcr +_LCbuild +_LCmulcc +_LCmulcr +__setusermatherr +_cabs +_chgsign +_chgsignf +_copysign +_copysignf +_d_int +_dclass +_dexp +_dlog +_dnorm +_dpcomp +_dpoly +_dscale +_dsign +_dsin +_dtest +_dunscale +_except1 +_fd_int +_fdclass +_fdexp +_fdlog +_fdnorm +_fdopen +_fdpcomp +_fdpoly +_fdscale +_fdsign +_fdsin +_fdtest +_fdunscale +_finite +_finitef +_fpclass +_fpclassf +_get_FMA3_enable +_hypot +_hypotf +_isnan +_isnanf +_j0 +_j1 +_jn +_ld_int +_ldclass +_ldexp +_ldlog +_ldpcomp +_ldpoly +_ldscale +_ldsign +_ldsin +_ldtest +_ldunscale +_logb +_logbf +_nextafter +_nextafterf +_scalb +_scalbf +_set_FMA3_enable +_y0 +_y1 +_yn +acos +acosf +acosh +acoshf +acoshl +asin +asinf +asinh +asinhf +asinhl +atan +atan2 +atan2f +atanf +atanh +atanhf +atanhl +cabs +cabsf +cabsl +cacos +cacosf +cacosh +cacoshf +cacoshl +cacosl +carg +cargf +cargl +casin +casinf +casinh +casinhf +casinhl +casinl +catan +catanf +catanh +catanhf +catanhl +catanl +cbrt +cbrtf +cbrtl +ccos +ccosf +ccosh +ccoshf +ccoshl +ccosl +ceil +ceilf +cexp +cexpf +cexpl +cimag +cimagf +cimagl +clog +clog10 +clog10f +clog10l +clogf +clogl +conj +conjf +conjl +copysign +copysignf +copysignl +cos +cosf +cosh +coshf +cpow +cpowf +cpowl +cproj +cprojf +cprojl +creal +crealf +creall +csin +csinf +csinh +csinhf +csinhl +csinl +csqrt +csqrtf +csqrtl +ctan +ctanf +ctanh +ctanhf +ctanhl +ctanl +erf +erfc +erfcf +erfcl +erff +erfl +exp +exp2 +exp2f +exp2l +expf +expm1 +expm1f +expm1l +fabs +fdim +fdimf 
+fdiml +floor +floorf +fma +fmaf +fmal +fmax +fmaxf +fmaxl +fmin +fminf +fminl +fmod +fmodf +frexp +hypot +ilogb +ilogbf +ilogbl +ldexp +lgamma +lgammaf +lgammal +llrint +llrintf +llrintl +llround +llroundf +llroundl +log +log10 +log10f +log1p +log1pf +log1pl +log2 +log2f +log2l +logb +logbf +logbl +logf +lrint +lrintf +lrintl +lround +lroundf +lroundl +modf +modff +nan +nanf +nanl +nearbyint +nearbyintf +nearbyintl +nextafter +nextafterf +nextafterl +nexttoward +nexttowardf +nexttowardl +norm +normf +norml +pow +powf +remainder +remainderf +remainderl +remquo +remquof +remquol +rint +rintf +rintl +round +roundf +roundl +scalbln +scalblnf +scalblnl +scalbn +scalbnf +scalbnl +sin +sinf +sinh +sinhf +sqrt +sqrtf +tan +tanf +tanh +tanhf +tgamma +tgammaf +tgammal +trunc +truncf +truncl diff --git a/src/utils/dolbyvision.c b/src/utils/dolbyvision.c new file mode 100644 index 0000000..3798532 --- /dev/null +++ b/src/utils/dolbyvision.c @@ -0,0 +1,63 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "common.h" +#include <libplacebo/utils/dolbyvision.h> + +#ifdef PL_HAVE_LIBDOVI +#include <libplacebo/tone_mapping.h> +#include <libdovi/rpu_parser.h> +#endif + +void pl_hdr_metadata_from_dovi_rpu(struct pl_hdr_metadata *out, + const uint8_t *buf, size_t size) +{ +#ifdef PL_HAVE_LIBDOVI + if (buf && size) { + DoviRpuOpaque *rpu = + dovi_parse_unspec62_nalu(buf, size); + const DoviRpuDataHeader *header = dovi_rpu_get_header(rpu); + + if (header && header->vdr_dm_metadata_present_flag) { + // Profile 4 reshaping isn't done as it is a dual layer format. + // However there are still unknowns on its EOTF, so it cannot be enabled. + // + // For profile 7, the brightness metadata can still be used as most + // titles are going to have accurate metadata<->image brightness, + // with the exception of some titles that require the enhancement layer + // to be processed to restore the intended brightness, which would then + // match the metadata values. + if (header->guessed_profile == 4) { + goto done; + } + + const DoviVdrDmData *vdr_dm_data = dovi_rpu_get_vdr_dm_data(rpu); + if (vdr_dm_data->dm_data.level1) { + const DoviExtMetadataBlockLevel1 *l1 = vdr_dm_data->dm_data.level1; + out->max_pq_y = l1->max_pq / 4095.0f; + out->avg_pq_y = l1->avg_pq / 4095.0f; + } + + dovi_rpu_free_vdr_dm_data(vdr_dm_data); + } + + done: + dovi_rpu_free_header(header); + dovi_rpu_free(rpu); + } +#endif +} diff --git a/src/utils/frame_queue.c b/src/utils/frame_queue.c new file mode 100644 index 0000000..0155983 --- /dev/null +++ b/src/utils/frame_queue.c @@ -0,0 +1,1030 @@ +/* + * This file is part of libplacebo. 
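/*
 * Editor's note: the L1 values above are 12-bit PQ-coded luminances, so
 * dividing by 4095.0 yields a PQ value in [0,1], which libplacebo stores
 * directly (max_pq_y / avg_pq_y) rather than converting to nits. For
 * illustration only, such a value could be converted to nits with the
 * ST 2084 EOTF (assumes <math.h>; this helper is not part of the API):
 */
static float l1_pq_to_nits(unsigned pq12)
{
    const float m1 = 2610.0f / 16384, m2 = 2523.0f / 4096 * 128,
                c1 = 3424.0f / 4096,  c2 = 2413.0f / 4096 * 32,
                c3 = 2392.0f / 4096 * 32;
    float p = powf(pq12 / 4095.0f, 1.0f / m2);
    return 10000.0f * powf(fmaxf(p - c1, 0.0f) / (c2 - c3 * p), 1.0f / m1);
}

// l1_pq_to_nits(4095) is 10000 nits (PQ peak); l1_pq_to_nits(2081) is roughly
// 100 nits.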
+ * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <errno.h> +#include <math.h> + +#include "common.h" +#include "log.h" +#include "pl_thread.h" + +#include <libplacebo/utils/frame_queue.h> + +struct cache_entry { + pl_tex tex[4]; +}; + +struct entry { + pl_rc_t rc; + double pts; + struct cache_entry cache; + struct pl_source_frame src; + struct pl_frame frame; + uint64_t signature; + bool mapped; + bool ok; + + // for interlaced frames + enum pl_field field; + struct entry *primary; + struct entry *prev, *next; + bool dirty; +}; + +// Hard limits for vsync timing validity +#define MIN_FPS 10 +#define MAX_FPS 400 + +// Limits for FPS estimation state +#define MAX_SAMPLES 32 +#define MIN_SAMPLES 4 + +// Stickiness to prevent `interpolation_threshold` oscillation +#define THRESHOLD_MAX_RATIO 0.3 +#define THRESHOLD_FRAMES 5 + +// Maximum number of not-yet-mapped frames to allow queueing in advance +#define PREFETCH_FRAMES 2 + +struct pool { + float samples[MAX_SAMPLES]; + float estimate; + float sum; + int idx; + int num; + int total; +}; + +struct pl_queue_t { + pl_gpu gpu; + pl_log log; + + // For multi-threading, we use two locks. The `lock_weak` guards the queue + // state itself. The `lock_strong` has a bigger scope and should be held + // for the duration of any functions that expect the queue state to + // remain more or less valid (with the exception of adding new members). + // + // In particular, `pl_queue_reset` and `pl_queue_update` will take + // the strong lock, while `pl_queue_push_*` will only take the weak + // lock. 
+ pl_mutex lock_strong; + pl_mutex lock_weak; + pl_cond wakeup; + + // Frame queue and state + PL_ARRAY(struct entry *) queue; + uint64_t signature; + int threshold_frames; + bool want_frame; + bool eof; + + // Average vsync/frame fps estimation state + struct pool vps, fps; + float reported_vps; + float reported_fps; + double prev_pts; + + // Storage for temporary arrays + PL_ARRAY(uint64_t) tmp_sig; + PL_ARRAY(float) tmp_ts; + PL_ARRAY(const struct pl_frame *) tmp_frame; + + // Queue of GPU objects to reuse + PL_ARRAY(struct cache_entry) cache; +}; + +pl_queue pl_queue_create(pl_gpu gpu) +{ + pl_queue p = pl_alloc_ptr(NULL, p); + *p = (struct pl_queue_t) { + .gpu = gpu, + .log = gpu->log, + }; + + pl_mutex_init(&p->lock_strong); + pl_mutex_init(&p->lock_weak); + int ret = pl_cond_init(&p->wakeup); + if (ret) { + PL_ERR(p, "Failed to init conditional variable: %d", ret); + return NULL; + } + return p; +} + +static void recycle_cache(pl_queue p, struct cache_entry *cache, bool recycle) +{ + bool has_textures = false; + for (int i = 0; i < PL_ARRAY_SIZE(cache->tex); i++) { + if (!cache->tex[i]) + continue; + + has_textures = true; + if (recycle) { + pl_tex_invalidate(p->gpu, cache->tex[i]); + } else { + pl_tex_destroy(p->gpu, &cache->tex[i]); + } + } + + if (recycle && has_textures) + PL_ARRAY_APPEND(p, p->cache, *cache); + + memset(cache, 0, sizeof(*cache)); // sanity +} + +static void entry_deref(pl_queue p, struct entry **pentry, bool recycle) +{ + struct entry *entry = *pentry; + *pentry = NULL; + if (!entry || !pl_rc_deref(&entry->rc)) + return; + + if (!entry->mapped && entry->src.discard) { + PL_TRACE(p, "Discarding unused frame id %"PRIu64" with PTS %f", + entry->signature, entry->src.pts); + entry->src.discard(&entry->src); + } + + if (entry->mapped && entry->ok && entry->src.unmap) { + PL_TRACE(p, "Unmapping frame id %"PRIu64" with PTS %f", + entry->signature, entry->src.pts); + entry->src.unmap(p->gpu, &entry->frame, &entry->src); + } + + recycle_cache(p, &entry->cache, recycle); + pl_free(entry); +} + +static struct entry *entry_ref(struct entry *entry) +{ + pl_rc_ref(&entry->rc); + return entry; +} + +static void entry_cull(pl_queue p, struct entry *entry, bool recycle) +{ + // Forcibly clean up references to prev/next frames, even if `entry` has + // remaining refs pointing at it. This is to prevent cyclic references. 
+ entry_deref(p, &entry->primary, recycle); + entry_deref(p, &entry->prev, recycle); + entry_deref(p, &entry->next, recycle); + entry_deref(p, &entry, recycle); +} + +void pl_queue_destroy(pl_queue *queue) +{ + pl_queue p = *queue; + if (!p) + return; + + for (int n = 0; n < p->queue.num; n++) + entry_cull(p, p->queue.elem[n], false); + for (int n = 0; n < p->cache.num; n++) { + for (int i = 0; i < PL_ARRAY_SIZE(p->cache.elem[n].tex); i++) + pl_tex_destroy(p->gpu, &p->cache.elem[n].tex[i]); + } + + pl_cond_destroy(&p->wakeup); + pl_mutex_destroy(&p->lock_weak); + pl_mutex_destroy(&p->lock_strong); + pl_free(p); + *queue = NULL; +} + +void pl_queue_reset(pl_queue p) +{ + pl_mutex_lock(&p->lock_strong); + pl_mutex_lock(&p->lock_weak); + + for (int i = 0; i < p->queue.num; i++) + entry_cull(p, p->queue.elem[i], false); + + *p = (struct pl_queue_t) { + .gpu = p->gpu, + .log = p->log, + + // Reuse lock objects + .lock_strong = p->lock_strong, + .lock_weak = p->lock_weak, + .wakeup = p->wakeup, + + // Explicitly preserve allocations + .queue.elem = p->queue.elem, + .tmp_sig.elem = p->tmp_sig.elem, + .tmp_ts.elem = p->tmp_ts.elem, + .tmp_frame.elem = p->tmp_frame.elem, + + // Reuse GPU object cache entirely + .cache = p->cache, + }; + + pl_cond_signal(&p->wakeup); + pl_mutex_unlock(&p->lock_weak); + pl_mutex_unlock(&p->lock_strong); +} + +static inline float delta(float old, float new) +{ + return fabsf((new - old) / PL_MIN(new, old)); +} + +static inline void default_estimate(struct pool *pool, float val) +{ + if (!pool->estimate && isnormal(val) && val > 0.0) + pool->estimate = val; +} + +static inline void update_estimate(struct pool *pool, float cur) +{ + if (pool->num) { + static const float max_delta = 0.3; + if (delta(pool->sum / pool->num, cur) > max_delta) { + pool->sum = 0.0; + pool->num = pool->idx = 0; + } + } + + if (pool->num++ == MAX_SAMPLES) { + pool->sum -= pool->samples[pool->idx]; + pool->num--; + } + + pool->sum += pool->samples[pool->idx] = cur; + pool->idx = (pool->idx + 1) % MAX_SAMPLES; + pool->total++; + + if (pool->total < MIN_SAMPLES || pool->num >= MIN_SAMPLES) + pool->estimate = pool->sum / pool->num; +} + +static void queue_push(pl_queue p, const struct pl_source_frame *src) +{ + if (p->eof && !src) + return; // ignore duplicate EOF + + if (p->eof && src) { + PL_INFO(p, "Received frame after EOF signaled... discarding frame!"); + if (src->discard) + src->discard(src); + return; + } + + pl_cond_signal(&p->wakeup); + + if (!src) { + PL_TRACE(p, "Received EOF, draining frame queue..."); + p->eof = true; + p->want_frame = false; + return; + } + + // Update FPS estimates if possible/reasonable + default_estimate(&p->fps, src->first_field ? 
src->duration / 2 : src->duration); + if (p->queue.num) { + double last_pts = p->queue.elem[p->queue.num - 1]->pts; + float delta = src->pts - last_pts; + if (delta <= 0.0f) { + PL_DEBUG(p, "Non monotonically increasing PTS %f -> %f", last_pts, src->pts); + } else if (p->fps.estimate && delta > 10.0 * p->fps.estimate) { + PL_DEBUG(p, "Discontinuous source PTS jump %f -> %f", last_pts, src->pts); + } else { + update_estimate(&p->fps, delta); + } + } else if (src->pts != 0) { + PL_DEBUG(p, "First frame received with non-zero PTS %f", src->pts); + } + + struct entry *entry = pl_alloc_ptr(NULL, entry); + *entry = (struct entry) { + .signature = p->signature++, + .pts = src->pts, + .src = *src, + }; + pl_rc_init(&entry->rc); + PL_ARRAY_POP(p->cache, &entry->cache); + PL_TRACE(p, "Added new frame id %"PRIu64" with PTS %f", + entry->signature, entry->pts); + + // Insert new entry into the correct spot in the queue, sorted by PTS + for (int i = p->queue.num;; i--) { + if (i == 0 || p->queue.elem[i - 1]->pts <= entry->pts) { + if (src->first_field == PL_FIELD_NONE) { + // Progressive + PL_ARRAY_INSERT_AT(p, p->queue, i, entry); + break; + } else { + // Interlaced + struct entry *prev = i > 0 ? p->queue.elem[i - 1] : NULL; + struct entry *next = i < p->queue.num ? p->queue.elem[i] : NULL; + struct entry *entry2 = pl_zalloc_ptr(NULL, entry2); + pl_rc_init(&entry2->rc); + if (next) { + entry2->pts = (entry->pts + next->pts) / 2; + } else if (src->duration) { + entry2->pts = entry->pts + src->duration / 2; + } else if (p->fps.estimate) { + entry2->pts = entry->pts + p->fps.estimate; + } else { + PL_ERR(p, "Frame with PTS %f specified as interlaced, but " + "no FPS information known yet! Please specify a " + "valid `pl_source_frame.duration`. Treating as " + "progressive...", src->pts); + PL_ARRAY_INSERT_AT(p, p->queue, i, entry); + pl_free(entry2); + break; + } + + entry->field = src->first_field; + entry2->primary = entry_ref(entry); + entry2->field = pl_field_other(entry->field); + entry2->signature = p->signature++; + + PL_TRACE(p, "Added second field id %"PRIu64" with PTS %f", + entry2->signature, entry2->pts); + + // Link previous/next frames + if (prev) { + entry->prev = entry_ref(PL_DEF(prev->primary, prev)); + entry2->prev = entry_ref(PL_DEF(prev->primary, prev)); + // Retroactively re-link the previous frames that should + // be referencing this frame + for (int j = i - 1; j >= 0; --j) { + struct entry *e = p->queue.elem[j]; + if (e != prev && e != prev->primary) + break; + entry_deref(p, &e->next, true); + e->next = entry_ref(entry); + if (e->dirty) { // reset signature to signal change + e->signature = p->signature++; + e->dirty = false; + } + } + } + + if (next) { + entry->next = entry_ref(PL_DEF(next->primary, next)); + entry2->next = entry_ref(PL_DEF(next->primary, next)); + for (int j = i; j < p->queue.num; j++) { + struct entry *e = p->queue.elem[j]; + if (e != next && e != next->primary) + break; + entry_deref(p, &e->prev, true); + e->prev = entry_ref(entry); + if (e->dirty) { + e->signature = p->signature++; + e->dirty = false; + } + } + } + + PL_ARRAY_INSERT_AT(p, p->queue, i, entry); + PL_ARRAY_INSERT_AT(p, p->queue, i+1, entry2); + break; + } + } + } + + p->want_frame = false; +} + +void pl_queue_push(pl_queue p, const struct pl_source_frame *frame) +{ + pl_mutex_lock(&p->lock_weak); + queue_push(p, frame); + pl_mutex_unlock(&p->lock_weak); +} + +static inline bool entry_mapped(struct entry *entry) +{ + return entry->mapped || (entry->primary && entry->primary->mapped); +} + 
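/*
 * Editor's sketch of the producer side implied by queue_push() above: the
 * caller fills in a pl_source_frame with a PTS, an optional duration, and
 * map/unmap callbacks, and mapping is deferred until the frame is actually
 * needed. Only the fields used by the code above (.pts, .duration, .map) are
 * relied on here; the texture is assumed to exist already, so this is a toy
 * illustration rather than a complete integration.
 */
static pl_tex example_tex; // assumed to be created/uploaded elsewhere

static bool example_map(pl_gpu gpu, pl_tex *tex,
                        const struct pl_source_frame *src,
                        struct pl_frame *out)
{
    (void) gpu; (void) tex; (void) src; // cache textures unused in this toy
    *out = (struct pl_frame) {
        .num_planes = 1,
        .planes[0] = {
            .texture = example_tex,
            .components = 1,
            .component_mapping = {0},
        },
    };
    return true;
}

static void example_push(pl_queue queue, double pts, float frame_duration)
{
    pl_queue_push(queue, &(struct pl_source_frame) {
        .pts = pts,
        .duration = frame_duration,
        .map = example_map,
        // .unmap / .discard omitted: nothing to release in this toy example
    });
}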
+static bool queue_has_room(pl_queue p) +{ + if (p->want_frame) + return true; + + int wanted_frames = PREFETCH_FRAMES; + if (p->fps.estimate && p->vps.estimate && p->vps.estimate <= 1.0f / MIN_FPS) + wanted_frames += ceilf(p->vps.estimate / p->fps.estimate) - 1; + + // Examine the queue tail + for (int i = p->queue.num - 1; i >= 0; i--) { + if (entry_mapped(p->queue.elem[i])) + return true; + if (p->queue.num - i >= wanted_frames) + return false; + } + + return true; +} + +bool pl_queue_push_block(pl_queue p, uint64_t timeout, + const struct pl_source_frame *frame) +{ + pl_mutex_lock(&p->lock_weak); + if (!timeout || !frame || p->eof) + goto skip_blocking; + + while (!queue_has_room(p) && !p->eof) { + if (pl_cond_timedwait(&p->wakeup, &p->lock_weak, timeout) == ETIMEDOUT) { + pl_mutex_unlock(&p->lock_weak); + return false; + } + } + +skip_blocking: + + queue_push(p, frame); + pl_mutex_unlock(&p->lock_weak); + return true; +} + +static void report_estimates(pl_queue p) +{ + if (p->fps.total >= MIN_SAMPLES && p->vps.total >= MIN_SAMPLES) { + if (p->reported_fps && p->reported_vps) { + // Only re-report the estimates if they've changed considerably + // from the previously reported values + static const float report_delta = 0.3f; + float delta_fps = delta(p->reported_fps, p->fps.estimate); + float delta_vps = delta(p->reported_vps, p->vps.estimate); + if (delta_fps < report_delta && delta_vps < report_delta) + return; + } + + PL_INFO(p, "Estimated source FPS: %.3f, display FPS: %.3f", + 1.0 / p->fps.estimate, 1.0 / p->vps.estimate); + + p->reported_fps = p->fps.estimate; + p->reported_vps = p->vps.estimate; + } +} + +// note: may add more than one frame, since it releases the lock +static enum pl_queue_status get_frame(pl_queue p, const struct pl_queue_params *params) +{ + if (p->eof) + return PL_QUEUE_EOF; + + if (!params->get_frame) { + if (!params->timeout) + return PL_QUEUE_MORE; + + p->want_frame = true; + pl_cond_signal(&p->wakeup); + + while (p->want_frame) { + if (pl_cond_timedwait(&p->wakeup, &p->lock_weak, params->timeout) == ETIMEDOUT) + return PL_QUEUE_MORE; + } + + return p->eof ? PL_QUEUE_EOF : PL_QUEUE_OK; + } + + // Don't hold the weak mutex while calling into `get_frame`, to allow + // `pl_queue_push` to run concurrently while we're waiting for frames + pl_mutex_unlock(&p->lock_weak); + + struct pl_source_frame src; + enum pl_queue_status ret; + switch ((ret = params->get_frame(&src, params))) { + case PL_QUEUE_OK: + pl_queue_push(p, &src); + break; + case PL_QUEUE_EOF: + pl_queue_push(p, NULL); + break; + case PL_QUEUE_MORE: + case PL_QUEUE_ERR: + break; + } + + pl_mutex_lock(&p->lock_weak); + return ret; +} + +static inline bool map_frame(pl_queue p, struct entry *entry) +{ + if (!entry->mapped) { + PL_TRACE(p, "Mapping frame id %"PRIu64" with PTS %f", + entry->signature, entry->pts); + entry->mapped = true; + entry->ok = entry->src.map(p->gpu, entry->cache.tex, + &entry->src, &entry->frame); + if (!entry->ok) + PL_ERR(p, "Failed mapping frame id %"PRIu64" with PTS %f", + entry->signature, entry->pts); + } + + return entry->ok; +} + +static bool map_entry(pl_queue p, struct entry *entry) +{ + bool ok = map_frame(p, entry->primary ? 
entry->primary : entry); + if (entry->prev) + ok &= map_frame(p, entry->prev); + if (entry->next) + ok &= map_frame(p, entry->next); + if (!ok) + return false; + + if (entry->primary) + entry->frame = entry->primary->frame; + + if (entry->field) { + entry->frame.field = entry->field; + entry->frame.first_field = PL_DEF(entry->primary, entry)->src.first_field; + entry->frame.prev = entry->prev ? &entry->prev->frame : NULL; + entry->frame.next = entry->next ? &entry->next->frame : NULL; + entry->dirty = true; + } + + return true; +} + +static bool entry_complete(struct entry *entry) +{ + return entry->field ? !!entry->next : true; +} + +// Advance the queue as needed to make sure idx 0 is the last frame before +// `pts`, and idx 1 is the first frame after `pts` (unless this is the last). +// +// Returns PL_QUEUE_OK only if idx 0 is still legal under ZOH semantics. +static enum pl_queue_status advance(pl_queue p, double pts, + const struct pl_queue_params *params) +{ + // Cull all frames except the last frame before `pts` + int culled = 0; + for (int i = 1; i < p->queue.num; i++) { + if (p->queue.elem[i]->pts <= pts) { + entry_cull(p, p->queue.elem[i - 1], true); + culled++; + } + } + PL_ARRAY_REMOVE_RANGE(p->queue, 0, culled); + + // Keep adding new frames until we find one in the future, or EOF + enum pl_queue_status ret = PL_QUEUE_OK; + while (p->queue.num < 2) { + switch ((ret = get_frame(p, params))) { + case PL_QUEUE_ERR: + return ret; + case PL_QUEUE_EOF: + if (!p->queue.num) + return ret; + goto done; + case PL_QUEUE_MORE: + case PL_QUEUE_OK: + while (p->queue.num > 1 && p->queue.elem[1]->pts <= pts) { + entry_cull(p, p->queue.elem[0], true); + PL_ARRAY_REMOVE_AT(p->queue, 0); + } + if (ret == PL_QUEUE_MORE) + return ret; + continue; + } + } + + if (!entry_complete(p->queue.elem[1])) { + switch (get_frame(p, params)) { + case PL_QUEUE_ERR: + return PL_QUEUE_ERR; + case PL_QUEUE_MORE: + ret = PL_QUEUE_MORE; + // fall through + case PL_QUEUE_EOF: + case PL_QUEUE_OK: + goto done; + } + } + +done: + if (p->eof && p->queue.num == 1) { + if (p->queue.elem[0]->pts == 0.0 || !p->fps.estimate) { + // If the last frame has PTS 0.0, or we have no FPS estimate, then + // this is probably a single-frame file, in which case we want to + // extend the ZOH to infinity, rather than returning. Not a perfect + // heuristic, but w/e + return PL_QUEUE_OK; + } + + // Last frame is held for an extra `p->fps.estimate` duration, + // afterwards this function just returns EOF. 
+ if (pts < p->queue.elem[0]->pts + p->fps.estimate) { + ret = PL_QUEUE_OK; + } else { + entry_cull(p, p->queue.elem[0], true); + p->queue.num = 0; + return PL_QUEUE_EOF; + } + } + + pl_assert(p->queue.num); + return ret; +} + +static inline enum pl_queue_status point(pl_queue p, struct pl_frame_mix *mix, + const struct pl_queue_params *params) +{ + if (!p->queue.num) { + *mix = (struct pl_frame_mix) {0}; + return PL_QUEUE_MORE; + } + + // Find closest frame (nearest neighbour semantics) + struct entry *entry = p->queue.elem[0]; + if (entry->pts > params->pts) { // first frame not visible yet + *mix = (struct pl_frame_mix) {0}; + return PL_QUEUE_OK; + } + + double best = fabs(entry->pts - params->pts); + for (int i = 1; i < p->queue.num; i++) { + double dist = fabs(p->queue.elem[i]->pts - params->pts); + if (dist < best) { + entry = p->queue.elem[i]; + best = dist; + continue; + } else { + break; + } + } + + if (!map_entry(p, entry)) + return PL_QUEUE_ERR; + + // Return a mix containing only this single frame + p->tmp_sig.num = p->tmp_ts.num = p->tmp_frame.num = 0; + PL_ARRAY_APPEND(p, p->tmp_sig, entry->signature); + PL_ARRAY_APPEND(p, p->tmp_frame, &entry->frame); + PL_ARRAY_APPEND(p, p->tmp_ts, 0.0); + *mix = (struct pl_frame_mix) { + .num_frames = 1, + .frames = p->tmp_frame.elem, + .signatures = p->tmp_sig.elem, + .timestamps = p->tmp_ts.elem, + .vsync_duration = 1.0, + }; + + PL_TRACE(p, "Showing single frame id %"PRIu64" with PTS %f for target PTS %f", + entry->signature, entry->pts, params->pts); + + report_estimates(p); + return PL_QUEUE_OK; +} + +// Present a single frame as appropriate for `pts` +static enum pl_queue_status nearest(pl_queue p, struct pl_frame_mix *mix, + const struct pl_queue_params *params) +{ + enum pl_queue_status ret; + switch ((ret = advance(p, params->pts, params))) { + case PL_QUEUE_ERR: + case PL_QUEUE_EOF: + return ret; + case PL_QUEUE_OK: + case PL_QUEUE_MORE: + if (mix && point(p, mix, params) == PL_QUEUE_ERR) + return PL_QUEUE_ERR; + return ret; + } + + pl_unreachable(); +} + +// Special case of `interpolate` for radius = 0, in which case we need exactly +// the previous frame and the following frame +static enum pl_queue_status oversample(pl_queue p, struct pl_frame_mix *mix, + const struct pl_queue_params *params) +{ + enum pl_queue_status ret; + switch ((ret = advance(p, params->pts, params))) { + case PL_QUEUE_ERR: + case PL_QUEUE_EOF: + return ret; + case PL_QUEUE_OK: + break; + case PL_QUEUE_MORE: + if (!p->queue.num) { + if (mix) + *mix = (struct pl_frame_mix) {0}; + return ret; + } + break; + } + + if (!mix) + return PL_QUEUE_OK; + + // Can't oversample with only a single frame, fall back to point sampling + if (p->queue.num < 2 || p->queue.elem[0]->pts > params->pts) { + if (point(p, mix, params) != PL_QUEUE_OK) + return PL_QUEUE_ERR; + return ret; + } + + struct entry *entries[2] = { p->queue.elem[0], p->queue.elem[1] }; + pl_assert(entries[0]->pts <= params->pts); + pl_assert(entries[1]->pts >= params->pts); + + // Returning a mix containing both of these two frames + p->tmp_sig.num = p->tmp_ts.num = p->tmp_frame.num = 0; + for (int i = 0; i < 2; i++) { + if (!map_entry(p, entries[i])) + return PL_QUEUE_ERR; + float ts = (entries[i]->pts - params->pts) / p->fps.estimate; + PL_ARRAY_APPEND(p, p->tmp_sig, entries[i]->signature); + PL_ARRAY_APPEND(p, p->tmp_frame, &entries[i]->frame); + PL_ARRAY_APPEND(p, p->tmp_ts, ts); + } + + *mix = (struct pl_frame_mix) { + .num_frames = 2, + .frames = p->tmp_frame.elem, + .signatures = p->tmp_sig.elem, + 
.timestamps = p->tmp_ts.elem, + .vsync_duration = p->vps.estimate / p->fps.estimate, + }; + + PL_TRACE(p, "Oversampling 2 frames for target PTS %f:", params->pts); + for (int i = 0; i < mix->num_frames; i++) + PL_TRACE(p, " id %"PRIu64" ts %f", mix->signatures[i], mix->timestamps[i]); + + report_estimates(p); + return ret; +} + +// Present a mixture of frames, relative to the vsync ratio +static enum pl_queue_status interpolate(pl_queue p, struct pl_frame_mix *mix, + const struct pl_queue_params *params) +{ + // No FPS estimate available, possibly source contains only a single frame, + // or this is the first frame to be rendered. Fall back to point sampling. + if (!p->fps.estimate) + return nearest(p, mix, params); + + // Silently disable interpolation if the ratio dips lower than the + // configured threshold + float ratio = fabs(p->fps.estimate / p->vps.estimate - 1.0); + if (ratio < params->interpolation_threshold) { + if (!p->threshold_frames) { + PL_INFO(p, "Detected fps ratio %.4f below threshold %.4f, " + "disabling interpolation", + ratio, params->interpolation_threshold); + } + + p->threshold_frames = THRESHOLD_FRAMES + 1; + return nearest(p, mix, params); + } else if (ratio < THRESHOLD_MAX_RATIO && p->threshold_frames > 1) { + p->threshold_frames--; + return nearest(p, mix, params); + } else { + if (p->threshold_frames) { + PL_INFO(p, "Detected fps ratio %.4f exceeds threshold %.4f, " + "re-enabling interpolation", + ratio, params->interpolation_threshold); + } + p->threshold_frames = 0; + } + + // No radius information, special case in which we only need the previous + // and next frames. + if (!params->radius) + return oversample(p, mix, params); + + pl_assert(p->fps.estimate && p->vps.estimate); + float radius = params->radius * fmaxf(1.0f, p->vps.estimate / p->fps.estimate); + double min_pts = params->pts - radius * p->fps.estimate, + max_pts = params->pts + radius * p->fps.estimate; + + enum pl_queue_status ret; + switch ((ret = advance(p, min_pts, params))) { + case PL_QUEUE_ERR: + case PL_QUEUE_EOF: + return ret; + case PL_QUEUE_MORE: + goto done; + case PL_QUEUE_OK: + break; + } + + // Keep adding new frames until we've covered the range we care about + pl_assert(p->queue.num); + while (p->queue.elem[p->queue.num - 1]->pts < max_pts) { + switch ((ret = get_frame(p, params))) { + case PL_QUEUE_ERR: + return ret; + case PL_QUEUE_MORE: + goto done; + case PL_QUEUE_EOF:; + // Don't forward EOF until we've held the last frame for the + // desired ZOH hold duration + double last_pts = p->queue.elem[p->queue.num - 1]->pts; + if (last_pts && params->pts >= last_pts + p->fps.estimate) + return ret; + ret = PL_QUEUE_OK; + goto done; + case PL_QUEUE_OK: + continue; + } + } + + if (!entry_complete(p->queue.elem[p->queue.num - 1])) { + switch ((ret = get_frame(p, params))) { + case PL_QUEUE_MORE: + case PL_QUEUE_OK: + break; + case PL_QUEUE_ERR: + case PL_QUEUE_EOF: + return ret; + } + } + +done: ; + + if (!mix) + return PL_QUEUE_OK; + + // Construct a mix object representing the current queue state, starting at + // the last frame before `min_pts` to make sure there's a fallback frame + // available for ZOH semantics. 
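+    //
+    // Note that the resulting `timestamps` are normalized to units of source
+    // frame durations, i.e. ts = (frame pts - target pts) / fps.estimate, so
+    // with e.g. `radius = 2` and a display/source rate ratio near 1 the mix
+    // typically covers timestamps roughly in the range [-2, +2].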
+ p->tmp_sig.num = p->tmp_ts.num = p->tmp_frame.num = 0; + for (int i = 0; i < p->queue.num; i++) { + struct entry *entry = p->queue.elem[i]; + if (entry->pts > max_pts) + break; + if (!map_entry(p, entry)) + return PL_QUEUE_ERR; + float ts = (entry->pts - params->pts) / p->fps.estimate; + PL_ARRAY_APPEND(p, p->tmp_sig, entry->signature); + PL_ARRAY_APPEND(p, p->tmp_frame, &entry->frame); + PL_ARRAY_APPEND(p, p->tmp_ts, ts); + } + + *mix = (struct pl_frame_mix) { + .num_frames = p->tmp_frame.num, + .frames = p->tmp_frame.elem, + .signatures = p->tmp_sig.elem, + .timestamps = p->tmp_ts.elem, + .vsync_duration = p->vps.estimate / p->fps.estimate, + }; + + PL_TRACE(p, "Showing mix of %d frames for target PTS %f:", + mix->num_frames, params->pts); + for (int i = 0; i < mix->num_frames; i++) + PL_TRACE(p, " id %"PRIu64" ts %f", mix->signatures[i], mix->timestamps[i]); + + report_estimates(p); + return ret; +} + +static bool prefill(pl_queue p, const struct pl_queue_params *params) +{ + int min_frames = 2 * ceilf(params->radius); + if (p->fps.estimate && p->vps.estimate && p->vps.estimate <= 1.0f / MIN_FPS) + min_frames *= ceilf(p->vps.estimate / p->fps.estimate); + min_frames = PL_MAX(min_frames, PREFETCH_FRAMES); + + while (p->queue.num < min_frames) { + switch (get_frame(p, params)) { + case PL_QUEUE_ERR: + return false; + case PL_QUEUE_EOF: + case PL_QUEUE_MORE: + return true; + case PL_QUEUE_OK: + continue; + } + } + + // In the most likely case, the first few frames will all be required. So + // force-map them all to initialize GPU state on initial rendering. This is + // better than the alternative of missing the cache later, when timing is + // more relevant. + for (int i = 0; i < min_frames; i++) { + if (!map_entry(p, p->queue.elem[i])) + return false; + } + + return true; +} + +enum pl_queue_status pl_queue_update(pl_queue p, struct pl_frame_mix *out_mix, + const struct pl_queue_params *params) +{ + pl_mutex_lock(&p->lock_strong); + pl_mutex_lock(&p->lock_weak); + default_estimate(&p->vps, params->vsync_duration); + + float delta = params->pts - p->prev_pts; + if (delta < 0.0f) { + + // This is a backwards PTS jump. This is something we can handle + // semi-gracefully, but only if we haven't culled past the current + // frame yet. + if (p->queue.num && p->queue.elem[0]->pts > params->pts) { + PL_ERR(p, "Requested PTS %f is lower than the oldest frame " + "PTS %f. This is not supported, PTS must be monotonically " + "increasing! Please use `pl_queue_reset` to reset the frame " + "queue on discontinuous PTS jumps.", + params->pts, p->queue.elem[0]->pts); + pl_mutex_unlock(&p->lock_weak); + pl_mutex_unlock(&p->lock_strong); + return PL_QUEUE_ERR; + } + + } else if (delta > 1.0f) { + + // A jump of more than a second is probably the result of a + // discontinuous jump after a suspend. To prevent this from exploding + // the FPS estimate, treat this as a new frame. 
+ PL_TRACE(p, "Discontinuous target PTS jump %f -> %f, ignoring...", + p->prev_pts, params->pts); + + } else if (delta > 0) { + + update_estimate(&p->vps, params->pts - p->prev_pts); + + } + + p->prev_pts = params->pts; + + // As a special case, prefill the queue if this is the first frame + if (!params->pts && !p->queue.num) { + if (!prefill(p, params)) { + pl_mutex_unlock(&p->lock_weak); + pl_mutex_unlock(&p->lock_strong); + return PL_QUEUE_ERR; + } + } + + // Ignore unrealistically high or low FPS, common near start of playback + static const float max_vsync = 1.0 / MIN_FPS; + static const float min_vsync = 1.0 / MAX_FPS; + bool estimation_ok = p->vps.estimate > min_vsync && p->vps.estimate < max_vsync; + enum pl_queue_status ret; + + if (estimation_ok || params->vsync_duration > 0) { + // We know the vsync duration, so construct an interpolation mix + ret = interpolate(p, out_mix, params); + } else { + // We don't know the vsync duration (yet), so just point-sample + ret = nearest(p, out_mix, params); + } + + pl_cond_signal(&p->wakeup); + pl_mutex_unlock(&p->lock_weak); + pl_mutex_unlock(&p->lock_strong); + return ret; +} + +float pl_queue_estimate_fps(pl_queue p) +{ + pl_mutex_lock(&p->lock_weak); + float estimate = p->fps.estimate; + pl_mutex_unlock(&p->lock_weak); + return estimate ? 1.0f / estimate : 0.0f; +} + +float pl_queue_estimate_vps(pl_queue p) +{ + pl_mutex_lock(&p->lock_weak); + float estimate = p->vps.estimate; + pl_mutex_unlock(&p->lock_weak); + return estimate ? 1.0f / estimate : 0.0f; +} + +int pl_queue_num_frames(pl_queue p) +{ + pl_mutex_lock(&p->lock_weak); + int count = p->queue.num; + pl_mutex_unlock(&p->lock_weak); + return count; +} + +bool pl_queue_peek(pl_queue p, int idx, struct pl_source_frame *out) +{ + pl_mutex_lock(&p->lock_weak); + bool ok = idx >= 0 && idx < p->queue.num; + if (ok) + *out = p->queue.elem[idx]->src; + pl_mutex_unlock(&p->lock_weak); + return ok; +} diff --git a/src/utils/upload.c b/src/utils/upload.c new file mode 100644 index 0000000..75bd4bb --- /dev/null +++ b/src/utils/upload.c @@ -0,0 +1,382 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "log.h" +#include "common.h" +#include "gpu.h" + +#include <libplacebo/utils/upload.h> + +#define MAX_COMPS 4 + +struct comp { + int order; // e.g. 
0, 1, 2, 3 for RGBA + int size; // size in bits + int shift; // bit-shift / offset in bits +}; + +static int compare_comp(const void *pa, const void *pb) +{ + const struct comp *a = pa, *b = pb; + + // Move all of the components with a size of 0 to the end, so they can + // be ignored outright + if (a->size && !b->size) + return -1; + if (b->size && !a->size) + return 1; + + // Otherwise, just compare based on the shift + return PL_CMP(a->shift, b->shift); +} + +void pl_plane_data_from_comps(struct pl_plane_data *data, int size[4], + int shift[4]) +{ + struct comp comps[MAX_COMPS]; + for (int i = 0; i < PL_ARRAY_SIZE(comps); i++) { + comps[i].order = i; + comps[i].size = size[i]; + comps[i].shift = shift[i]; + } + + // Sort the components by shift + qsort(comps, MAX_COMPS, sizeof(struct comp), compare_comp); + + // Generate the resulting component size/pad/map + int offset = 0; + for (int i = 0; i < MAX_COMPS; i++) { + if (comps[i].size) { + assert(comps[i].shift >= offset); + data->component_size[i] = comps[i].size; + data->component_pad[i] = comps[i].shift - offset; + data->component_map[i] = comps[i].order; + offset += data->component_size[i] + data->component_pad[i]; + } else { + // Clear the superfluous entries for sanity + data->component_size[i] = 0; + data->component_pad[i] = 0; + data->component_map[i] = 0; + } + } +} + +void pl_plane_data_from_mask(struct pl_plane_data *data, uint64_t mask[4]) +{ + int size[4]; + int shift[4]; + + for (int i = 0; i < PL_ARRAY_SIZE(size); i++) { + size[i] = __builtin_popcountll(mask[i]); + shift[i] = PL_MAX(0, __builtin_ffsll(mask[i]) - 1); + + // Sanity checking + uint64_t mask_reconstructed = (1LLU << size[i]) - 1; + mask_reconstructed <<= shift[i]; + pl_assert(mask_reconstructed == mask[i]); + } + + pl_plane_data_from_comps(data, size, shift); +} + +bool pl_plane_data_align(struct pl_plane_data *data, struct pl_bit_encoding *out_bits) +{ + struct pl_plane_data aligned = *data; + struct pl_bit_encoding bits = {0}; + + int offset = 0; + +#define SET_TEST(var, value) \ + do { \ + if (offset == 0) { \ + (var) = (value); \ + } else if ((var) != (value)) { \ + goto misaligned; \ + } \ + } while (0) + + for (int i = 0; i < MAX_COMPS; i++) { + if (!aligned.component_size[i]) + break; + + // Can't meaningfully align alpha channel, so just skip it. This is a + // limitation of the fact that `pl_bit_encoding` only applies to the + // main color channels, and changing this would be very nontrivial. + if (aligned.component_map[i] == PL_CHANNEL_A) + continue; + + // Color depth is the original component size, before alignment + SET_TEST(bits.color_depth, aligned.component_size[i]); + + // Try consuming padding of the current component to align down. This + // corresponds to an extra bit shift to the left. + int comp_start = offset + aligned.component_pad[i]; + int left_delta = comp_start - PL_ALIGN2(comp_start - 7, 8); + left_delta = PL_MIN(left_delta, aligned.component_pad[i]); + aligned.component_pad[i] -= left_delta; + aligned.component_size[i] += left_delta; + SET_TEST(bits.bit_shift, left_delta); + + // Try consuming padding of the next component to align up. This + // corresponds to simply ignoring some extra 0s on the end. 
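+        //
+        // For example, a lone 10-bit component at bit offset 0 inside a
+        // 16-bit word absorbs the 6 trailing padding bits in this step,
+        // ending up with color_depth = 10, sample_depth = 16, bit_shift = 0.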
+ int comp_end = comp_start + aligned.component_size[i] - left_delta; + int right_delta = PL_ALIGN2(comp_end, 8) - comp_end; + if (i+1 == MAX_COMPS || !aligned.component_size[i+1]) { + // This is the last component, so we can be greedy + aligned.component_size[i] += right_delta; + } else { + right_delta = PL_MIN(right_delta, aligned.component_pad[i+1]); + aligned.component_pad[i+1] -= right_delta; + aligned.component_size[i] += right_delta; + } + + // Sample depth is the new total component size, including padding + SET_TEST(bits.sample_depth, aligned.component_size[i]); + + offset += aligned.component_pad[i] + aligned.component_size[i]; + } + + // Easy sanity check, to make sure that we don't exceed the known stride + if (aligned.pixel_stride && offset > aligned.pixel_stride * 8) + goto misaligned; + + *data = aligned; + if (out_bits) + *out_bits = bits; + return true; + +misaligned: + // Can't properly align anything, so just do a no-op + if (out_bits) + *out_bits = (struct pl_bit_encoding) {0}; + return false; +} + +pl_fmt pl_plane_find_fmt(pl_gpu gpu, int out_map[4], const struct pl_plane_data *data) +{ + int dummy[4] = {0}; + out_map = PL_DEF(out_map, dummy); + + // Endian swapping requires compute shaders (currently) + if (data->swapped && !gpu->limits.max_ssbo_size) + return NULL; + + // Count the number of components and initialize out_map + int num = 0; + for (int i = 0; i < PL_ARRAY_SIZE(data->component_size); i++) { + out_map[i] = -1; + if (data->component_size[i]) + num = i+1; + } + + for (int n = 0; n < gpu->num_formats; n++) { + pl_fmt fmt = gpu->formats[n]; + if (fmt->opaque || fmt->num_components < num) + continue; + if (fmt->type != data->type || fmt->texel_size != data->pixel_stride) + continue; + if (!(fmt->caps & PL_FMT_CAP_SAMPLEABLE)) + continue; + + int idx = 0; + + // Try mapping all pl_plane_data components to texture components + for (int i = 0; i < num; i++) { + // If there's padding we have to map it to an unused physical + // component first + int pad = data->component_pad[i]; + if (pad && (idx >= 4 || fmt->host_bits[idx++] != pad)) + goto next_fmt; + + // Otherwise, try and match this component + int size = data->component_size[i]; + if (size && (idx >= 4 || fmt->host_bits[idx] != size)) + goto next_fmt; + out_map[idx++] = data->component_map[i]; + } + + // Reject misaligned formats, check this last to only log such errors + // if this is the only thing preventing a format from being used, as + // this is likely an issue in the API usage. + if (data->row_stride % fmt->texel_align) { + PL_WARN(gpu, "Rejecting texture format '%s' due to misalignment: " + "Row stride %zu is not a clean multiple of texel size %zu! " + "This is likely an API usage bug.", + fmt->name, data->row_stride, fmt->texel_align); + continue; + } + + return fmt; + +next_fmt: ; // acts as `continue` + } + + return NULL; +} + +bool pl_upload_plane(pl_gpu gpu, struct pl_plane *out_plane, + pl_tex *tex, const struct pl_plane_data *data) +{ + pl_assert(!data->buf ^ !data->pixels); // exactly one + + int out_map[4]; + pl_fmt fmt = pl_plane_find_fmt(gpu, out_map, data); + if (!fmt) { + PL_ERR(gpu, "Failed picking any compatible texture format for a plane!"); + return false; + + // TODO: try soft-converting to a supported format using e.g zimg? 
+ } + + bool ok = pl_tex_recreate(gpu, tex, pl_tex_params( + .w = data->width, + .h = data->height, + .format = fmt, + .sampleable = true, + .host_writable = true, + .blit_src = fmt->caps & PL_FMT_CAP_BLITTABLE, + )); + + if (!ok) { + PL_ERR(gpu, "Failed initializing plane texture!"); + return false; + } + + if (out_plane) { + out_plane->texture = *tex; + out_plane->components = 0; + for (int i = 0; i < PL_ARRAY_SIZE(out_map); i++) { + out_plane->component_mapping[i] = out_map[i]; + if (out_map[i] >= 0) + out_plane->components = i+1; + } + } + + struct pl_tex_transfer_params params = { + .tex = *tex, + .rc.x1 = data->width, // set these for `pl_tex_transfer_size` + .rc.y1 = data->height, + .rc.z1 = 1, + .row_pitch = PL_DEF(data->row_stride, data->width * fmt->texel_size), + .ptr = (void *) data->pixels, + .buf = data->buf, + .buf_offset = data->buf_offset, + .callback = data->callback, + .priv = data->priv, + }; + + pl_buf swapbuf = NULL; + if (data->swapped) { + const size_t aligned = PL_ALIGN2(pl_tex_transfer_size(¶ms), 4); + swapbuf = pl_buf_create(gpu, pl_buf_params( + .size = aligned, + .storable = true, + .initial_data = params.ptr, + + // Note: This may over-read from `ptr` if `ptr` is not aligned to a + // word boundary, but the extra texels will be ignored by + // `pl_tex_upload` so this UB should be a non-issue in practice. + )); + if (!swapbuf) { + PL_ERR(gpu, "Failed creating endian swapping buffer!"); + return false; + } + + struct pl_buf_copy_swap_params swap_params = { + .src = swapbuf, + .dst = swapbuf, + .size = aligned, + .wordsize = fmt->texel_size / fmt->num_components, + }; + + bool can_reuse = params.buf && params.buf->params.storable && + params.buf_offset % 4 == 0 && + params.buf_offset + aligned <= params.buf->params.size; + + if (params.ptr) { + // Data is already uploaded (no-op), can swap in-place + } else if (can_reuse) { + // We can sample directly from the source buffer + swap_params.src = params.buf; + swap_params.src_offset = params.buf_offset; + } else { + // We sadly need to do a second memcpy + assert(params.buf); + PL_TRACE(gpu, "Double-slow path! 
pl_buf_copy -> pl_buf_copy_swap..."); + pl_buf_copy(gpu, swapbuf, 0, params.buf, params.buf_offset, + PL_MIN(aligned, params.buf->params.size - params.buf_offset)); + } + + if (!pl_buf_copy_swap(gpu, &swap_params)) { + PL_ERR(gpu, "Failed swapping endianness!"); + pl_buf_destroy(gpu, &swapbuf); + return false; + } + + params.ptr = NULL; + params.buf = swapbuf; + params.buf_offset = 0; + } + + ok = pl_tex_upload(gpu, ¶ms); + pl_buf_destroy(gpu, &swapbuf); + return ok; +} + +bool pl_recreate_plane(pl_gpu gpu, struct pl_plane *out_plane, + pl_tex *tex, const struct pl_plane_data *data) +{ + if (data->swapped) { + PL_ERR(gpu, "Cannot call pl_recreate_plane on non-native endian plane " + "data, this is only supported for `pl_upload_plane`!"); + return false; + } + + int out_map[4]; + pl_fmt fmt = pl_plane_find_fmt(gpu, out_map, data); + if (!fmt) { + PL_ERR(gpu, "Failed picking any compatible texture format for a plane!"); + return false; + } + + bool ok = pl_tex_recreate(gpu, tex, pl_tex_params( + .w = data->width, + .h = data->height, + .format = fmt, + .renderable = true, + .host_readable = fmt->caps & PL_FMT_CAP_HOST_READABLE, + .blit_dst = fmt->caps & PL_FMT_CAP_BLITTABLE, + .storable = fmt->caps & PL_FMT_CAP_STORABLE, + )); + + if (!ok) { + PL_ERR(gpu, "Failed initializing plane texture!"); + return false; + } + + if (out_plane) { + out_plane->texture = *tex; + out_plane->components = 0; + for (int i = 0; i < PL_ARRAY_SIZE(out_map); i++) { + out_plane->component_mapping[i] = out_map[i]; + if (out_map[i] >= 0) + out_plane->components = i+1; + } + } + + return true; +} diff --git a/src/version.h.in b/src/version.h.in new file mode 100644 index 0000000..22bdee8 --- /dev/null +++ b/src/version.h.in @@ -0,0 +1 @@ +#define BUILD_VERSION "@buildver@" diff --git a/src/vulkan/command.c b/src/vulkan/command.c new file mode 100644 index 0000000..5020aff --- /dev/null +++ b/src/vulkan/command.c @@ -0,0 +1,571 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "command.h" +#include "utils.h" + +// returns VK_SUCCESS (completed), VK_TIMEOUT (not yet completed) or an error +static VkResult vk_cmd_poll(struct vk_cmd *cmd, uint64_t timeout) +{ + struct vk_ctx *vk = cmd->pool->vk; + return vk->WaitSemaphores(vk->dev, &(VkSemaphoreWaitInfo) { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO, + .semaphoreCount = 1, + .pSemaphores = &cmd->sync.sem, + .pValues = &cmd->sync.value, + }, timeout); +} + +static void flush_callbacks(struct vk_ctx *vk) +{ + while (vk->num_pending_callbacks) { + const struct vk_callback *cb = vk->pending_callbacks++; + vk->num_pending_callbacks--; + cb->run(cb->priv, cb->arg); + } +} + +static void vk_cmd_reset(struct vk_cmd *cmd) +{ + struct vk_ctx *vk = cmd->pool->vk; + + // Flush possible callbacks left over from a previous command still in the + // process of being reset, whose callback triggered this command being + // reset. + flush_callbacks(vk); + vk->pending_callbacks = cmd->callbacks.elem; + vk->num_pending_callbacks = cmd->callbacks.num; + flush_callbacks(vk); + + cmd->callbacks.num = 0; + cmd->deps.num = 0; + cmd->sigs.num = 0; +} + +static void vk_cmd_destroy(struct vk_cmd *cmd) +{ + if (!cmd) + return; + + struct vk_ctx *vk = cmd->pool->vk; + vk_cmd_poll(cmd, UINT64_MAX); + vk_cmd_reset(cmd); + vk->DestroySemaphore(vk->dev, cmd->sync.sem, PL_VK_ALLOC); + vk->FreeCommandBuffers(vk->dev, cmd->pool->pool, 1, &cmd->buf); + + pl_free(cmd); +} + +static struct vk_cmd *vk_cmd_create(struct vk_cmdpool *pool) +{ + struct vk_ctx *vk = pool->vk; + struct vk_cmd *cmd = pl_zalloc_ptr(NULL, cmd); + cmd->pool = pool; + + VkCommandBufferAllocateInfo ainfo = { + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, + .commandPool = pool->pool, + .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY, + .commandBufferCount = 1, + }; + + VK(vk->AllocateCommandBuffers(vk->dev, &ainfo, &cmd->buf)); + + static const VkSemaphoreTypeCreateInfo stinfo = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO, + .semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE, + .initialValue = 0, + }; + + static const VkSemaphoreCreateInfo sinfo = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, + .pNext = &stinfo, + }; + + VK(vk->CreateSemaphore(vk->dev, &sinfo, PL_VK_ALLOC, &cmd->sync.sem)); + PL_VK_NAME(SEMAPHORE, cmd->sync.sem, "cmd"); + + return cmd; + +error: + vk_cmd_destroy(cmd); + vk->failed = true; + return NULL; +} + +void vk_dev_callback(struct vk_ctx *vk, vk_cb callback, + const void *priv, const void *arg) +{ + pl_mutex_lock(&vk->lock); + if (vk->cmds_pending.num > 0) { + struct vk_cmd *last_cmd = vk->cmds_pending.elem[vk->cmds_pending.num - 1]; + vk_cmd_callback(last_cmd, callback, priv, arg); + } else { + // The device was already idle, so we can just immediately call it + callback((void *) priv, (void *) arg); + } + pl_mutex_unlock(&vk->lock); +} + +void vk_cmd_callback(struct vk_cmd *cmd, vk_cb callback, + const void *priv, const void *arg) +{ + PL_ARRAY_APPEND(cmd, cmd->callbacks, (struct vk_callback) { + .run = callback, + .priv = (void *) priv, + .arg = (void *) arg, + }); +} + +void vk_cmd_dep(struct vk_cmd *cmd, VkPipelineStageFlags2 stage, pl_vulkan_sem dep) +{ + PL_ARRAY_APPEND(cmd, cmd->deps, (VkSemaphoreSubmitInfo) { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO, + .semaphore = dep.sem, + .value = dep.value, + .stageMask = stage, + }); +} + +void vk_cmd_sig(struct vk_cmd *cmd, VkPipelineStageFlags2 stage, pl_vulkan_sem sig) +{ + VkSemaphoreSubmitInfo sinfo = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO, + 
.semaphore = sig.sem, + .value = sig.value, + .stageMask = stage, + }; + + // Try updating existing semaphore signal operations in-place + for (int i = 0; i < cmd->sigs.num; i++) { + if (cmd->sigs.elem[i].semaphore == sig.sem) { + pl_assert(sig.value > cmd->sigs.elem[i].value); + cmd->sigs.elem[i] = sinfo; + return; + } + } + + PL_ARRAY_APPEND(cmd, cmd->sigs, sinfo); +} + +#define SET(FLAG, CHECK) \ + if (flags2 & (CHECK)) \ + flags |= FLAG + +static VkAccessFlags lower_access2(VkAccessFlags2 flags2) +{ + VkAccessFlags flags = flags2 & VK_ACCESS_FLAG_BITS_MAX_ENUM; + SET(VK_ACCESS_SHADER_READ_BIT, VK_ACCESS_2_SHADER_SAMPLED_READ_BIT | + VK_ACCESS_2_SHADER_STORAGE_READ_BIT); + SET(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT); + return flags; +} + +static VkPipelineStageFlags lower_stage2(VkPipelineStageFlags2 flags2) +{ + VkPipelineStageFlags flags = flags2 & VK_PIPELINE_STAGE_FLAG_BITS_MAX_ENUM; + SET(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_2_COPY_BIT | + VK_PIPELINE_STAGE_2_RESOLVE_BIT | + VK_PIPELINE_STAGE_2_BLIT_BIT | + VK_PIPELINE_STAGE_2_CLEAR_BIT); + SET(VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT | + VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT); + return flags; +} + +#undef SET + +void vk_cmd_barrier(struct vk_cmd *cmd, const VkDependencyInfo *info) +{ + struct vk_ctx *vk = cmd->pool->vk; + if (vk->CmdPipelineBarrier2KHR) { + vk->CmdPipelineBarrier2KHR(cmd->buf, info); + return; + } + + pl_assert(!info->pNext); + pl_assert(info->memoryBarrierCount == 0); + pl_assert(info->bufferMemoryBarrierCount + info->imageMemoryBarrierCount == 1); + + if (info->bufferMemoryBarrierCount) { + + const VkBufferMemoryBarrier2 *barr2 = info->pBufferMemoryBarriers; + const VkBufferMemoryBarrier barr = { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, + .pNext = barr2->pNext, + .srcAccessMask = lower_access2(barr2->srcAccessMask), + .dstAccessMask = lower_access2(barr2->dstAccessMask), + .srcQueueFamilyIndex = barr2->srcQueueFamilyIndex, + .dstQueueFamilyIndex = barr2->dstQueueFamilyIndex, + .buffer = barr2->buffer, + .offset = barr2->offset, + .size = barr2->size, + }; + + vk->CmdPipelineBarrier(cmd->buf, lower_stage2(barr2->srcStageMask), + lower_stage2(barr2->dstStageMask), + info->dependencyFlags, + 0, NULL, 1, &barr, 0, NULL); + + } else { + + const VkImageMemoryBarrier2 *barr2 = info->pImageMemoryBarriers; + const VkImageMemoryBarrier barr = { + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = barr2->pNext, + .srcAccessMask = lower_access2(barr2->srcAccessMask), + .dstAccessMask = lower_access2(barr2->dstAccessMask), + .oldLayout = barr2->oldLayout, + .newLayout = barr2->newLayout, + .srcQueueFamilyIndex = barr2->srcQueueFamilyIndex, + .dstQueueFamilyIndex = barr2->dstQueueFamilyIndex, + .image = barr2->image, + .subresourceRange = barr2->subresourceRange, + }; + + vk->CmdPipelineBarrier(cmd->buf, lower_stage2(barr2->srcStageMask), + lower_stage2(barr2->dstStageMask), + info->dependencyFlags, + 0, NULL, 0, NULL, 1, &barr); + } +} + +struct vk_sync_scope vk_sem_barrier(struct vk_cmd *cmd, struct vk_sem *sem, + VkPipelineStageFlags2 stage, + VkAccessFlags2 access, bool is_trans) +{ + bool is_write = (access & vk_access_write) || is_trans; + + // Writes need to be synchronized against the last *read* (which is + // transitively synchronized against the last write), reads only + // need to be synchronized against the last write. 
+ struct vk_sync_scope last = sem->write; + if (is_write && sem->read.access) + last = sem->read; + + if (last.queue != cmd->queue) { + if (!is_write && sem->read.queue == cmd->queue) { + // No semaphore needed in this case because the implicit submission + // order execution dependencies already transitively imply a wait + // for the previous write + } else if (last.sync.sem) { + // Image barrier still needs to depend on this stage for implicit + // ordering guarantees to apply properly + vk_cmd_dep(cmd, stage, last.sync); + last.stage = stage; + } + + // Last access is on different queue, so no pipeline barrier needed + last.access = 0; + } + + if (!is_write && sem->read.queue == cmd->queue && + (sem->read.stage & stage) == stage && + (sem->read.access & access) == access) + { + // A past pipeline barrier already covers this access transitively, so + // we don't need to emit another pipeline barrier at all + last.access = 0; + } + + if (is_write) { + sem->write = (struct vk_sync_scope) { + .sync = cmd->sync, + .queue = cmd->queue, + .stage = stage, + .access = access, + }; + + sem->read = (struct vk_sync_scope) { + .sync = cmd->sync, + .queue = cmd->queue, + // no stage or access scope, because no reads happened yet + }; + } else if (sem->read.queue == cmd->queue) { + // Coalesce multiple same-queue reads into a single access scope + sem->read.sync = cmd->sync; + sem->read.stage |= stage; + sem->read.access |= access; + } else { + sem->read = (struct vk_sync_scope) { + .sync = cmd->sync, + .queue = cmd->queue, + .stage = stage, + .access = access, + }; + } + + // We never need to include pipeline barriers for reads, only writes + last.access &= vk_access_write; + return last; +} + +struct vk_cmdpool *vk_cmdpool_create(struct vk_ctx *vk, int qf, int qnum, + VkQueueFamilyProperties props) +{ + struct vk_cmdpool *pool = pl_alloc_ptr(NULL, pool); + *pool = (struct vk_cmdpool) { + .vk = vk, + .props = props, + .qf = qf, + .queues = pl_calloc(pool, qnum, sizeof(VkQueue)), + .num_queues = qnum, + }; + + for (int n = 0; n < qnum; n++) + vk->GetDeviceQueue(vk->dev, qf, n, &pool->queues[n]); + + VkCommandPoolCreateInfo cinfo = { + .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, + .flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | + VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, + .queueFamilyIndex = qf, + }; + + VK(vk->CreateCommandPool(vk->dev, &cinfo, PL_VK_ALLOC, &pool->pool)); + return pool; + +error: + vk_cmdpool_destroy(pool); + vk->failed = true; + return NULL; +} + +void vk_cmdpool_destroy(struct vk_cmdpool *pool) +{ + if (!pool) + return; + + for (int i = 0; i < pool->cmds.num; i++) + vk_cmd_destroy(pool->cmds.elem[i]); + + struct vk_ctx *vk = pool->vk; + vk->DestroyCommandPool(vk->dev, pool->pool, PL_VK_ALLOC); + pl_free(pool); +} + +struct vk_cmd *vk_cmd_begin(struct vk_cmdpool *pool, pl_debug_tag debug_tag) +{ + struct vk_ctx *vk = pool->vk; + + // Garbage collect the cmdpool first, to increase the chances of getting + // an already-available command buffer. 
+ vk_poll_commands(vk, 0); + + struct vk_cmd *cmd = NULL; + pl_mutex_lock(&vk->lock); + if (!PL_ARRAY_POP(pool->cmds, &cmd)) { + cmd = vk_cmd_create(pool); + if (!cmd) { + pl_mutex_unlock(&vk->lock); + goto error; + } + } + + cmd->qindex = pool->idx_queues; + cmd->queue = pool->queues[cmd->qindex]; + pl_mutex_unlock(&vk->lock); + + VkCommandBufferBeginInfo binfo = { + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, + .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT, + }; + + VK(vk->BeginCommandBuffer(cmd->buf, &binfo)); + + debug_tag = PL_DEF(debug_tag, "vk_cmd"); + PL_VK_NAME_HANDLE(COMMAND_BUFFER, cmd->buf, debug_tag); + PL_VK_NAME(SEMAPHORE, cmd->sync.sem, debug_tag); + + cmd->sync.value++; + vk_cmd_sig(cmd, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, cmd->sync); + return cmd; + +error: + // Something has to be seriously messed up if we get to this point + vk_cmd_destroy(cmd); + vk->failed = true; + return NULL; +} + +static VkResult vk_queue_submit2(struct vk_ctx *vk, VkQueue queue, + const VkSubmitInfo2 *info2, VkFence fence) +{ + if (vk->QueueSubmit2KHR) + return vk->QueueSubmit2KHR(queue, 1, info2, fence); + + const uint32_t num_deps = info2->waitSemaphoreInfoCount; + const uint32_t num_sigs = info2->signalSemaphoreInfoCount; + const uint32_t num_cmds = info2->commandBufferInfoCount; + + void *tmp = pl_tmp(NULL); + VkSemaphore *deps = pl_calloc_ptr(tmp, num_deps, deps); + VkPipelineStageFlags *masks = pl_calloc_ptr(tmp, num_deps, masks); + uint64_t *depvals = pl_calloc_ptr(tmp, num_deps, depvals); + VkSemaphore *sigs = pl_calloc_ptr(tmp, num_sigs, sigs); + uint64_t *sigvals = pl_calloc_ptr(tmp, num_sigs, sigvals); + VkCommandBuffer *cmds = pl_calloc_ptr(tmp, num_cmds, cmds); + + for (int i = 0; i < num_deps; i++) { + deps[i] = info2->pWaitSemaphoreInfos[i].semaphore; + masks[i] = info2->pWaitSemaphoreInfos[i].stageMask; + depvals[i] = info2->pWaitSemaphoreInfos[i].value; + } + for (int i = 0; i < num_sigs; i++) { + sigs[i] = info2->pSignalSemaphoreInfos[i].semaphore; + sigvals[i] = info2->pSignalSemaphoreInfos[i].value; + } + for (int i = 0; i < num_cmds; i++) + cmds[i] = info2->pCommandBufferInfos[i].commandBuffer; + + const VkTimelineSemaphoreSubmitInfo tinfo = { + .sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO, + .pNext = info2->pNext, + .waitSemaphoreValueCount = num_deps, + .pWaitSemaphoreValues = depvals, + .signalSemaphoreValueCount = num_sigs, + .pSignalSemaphoreValues = sigvals, + }; + + const VkSubmitInfo info = { + .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO, + .pNext = &tinfo, + .waitSemaphoreCount = num_deps, + .pWaitSemaphores = deps, + .pWaitDstStageMask = masks, + .commandBufferCount = num_cmds, + .pCommandBuffers = cmds, + .signalSemaphoreCount = num_sigs, + .pSignalSemaphores = sigs, + }; + + VkResult res = vk->QueueSubmit(queue, 1, &info, fence); + pl_free(tmp); + return res; +} + +bool vk_cmd_submit(struct vk_cmd **pcmd) +{ + struct vk_cmd *cmd = *pcmd; + if (!cmd) + return true; + + *pcmd = NULL; + struct vk_cmdpool *pool = cmd->pool; + struct vk_ctx *vk = pool->vk; + + VK(vk->EndCommandBuffer(cmd->buf)); + + VkSubmitInfo2 sinfo = { + .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2, + .waitSemaphoreInfoCount = cmd->deps.num, + .pWaitSemaphoreInfos = cmd->deps.elem, + .signalSemaphoreInfoCount = cmd->sigs.num, + .pSignalSemaphoreInfos = cmd->sigs.elem, + .commandBufferInfoCount = 1, + .pCommandBufferInfos = &(VkCommandBufferSubmitInfo) { + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO, + .commandBuffer = cmd->buf, + }, + }; + + if (pl_msg_test(vk->log, 
PL_LOG_TRACE)) { + PL_TRACE(vk, "Submitting command %p on queue %p (QF %d):", + (void *) cmd->buf, (void *) cmd->queue, pool->qf); + for (int n = 0; n < cmd->deps.num; n++) { + PL_TRACE(vk, " waits on semaphore 0x%"PRIx64" = %"PRIu64, + (uint64_t) cmd->deps.elem[n].semaphore, cmd->deps.elem[n].value); + } + for (int n = 0; n < cmd->sigs.num; n++) { + PL_TRACE(vk, " signals semaphore 0x%"PRIx64" = %"PRIu64, + (uint64_t) cmd->sigs.elem[n].semaphore, cmd->sigs.elem[n].value); + } + if (cmd->callbacks.num) + PL_TRACE(vk, " signals %d callbacks", cmd->callbacks.num); + } + + vk->lock_queue(vk->queue_ctx, pool->qf, cmd->qindex); + VkResult res = vk_queue_submit2(vk, cmd->queue, &sinfo, VK_NULL_HANDLE); + vk->unlock_queue(vk->queue_ctx, pool->qf, cmd->qindex); + PL_VK_ASSERT(res, "vkQueueSubmit2"); + + pl_mutex_lock(&vk->lock); + PL_ARRAY_APPEND(vk->alloc, vk->cmds_pending, cmd); + pl_mutex_unlock(&vk->lock); + return true; + +error: + vk_cmd_reset(cmd); + pl_mutex_lock(&vk->lock); + PL_ARRAY_APPEND(pool, pool->cmds, cmd); + pl_mutex_unlock(&vk->lock); + vk->failed = true; + return false; +} + +bool vk_poll_commands(struct vk_ctx *vk, uint64_t timeout) +{ + bool ret = false; + pl_mutex_lock(&vk->lock); + + while (vk->cmds_pending.num) { + struct vk_cmd *cmd = vk->cmds_pending.elem[0]; + struct vk_cmdpool *pool = cmd->pool; + pl_mutex_unlock(&vk->lock); // don't hold mutex while blocking + if (vk_cmd_poll(cmd, timeout) == VK_TIMEOUT) + return ret; + pl_mutex_lock(&vk->lock); + if (!vk->cmds_pending.num || vk->cmds_pending.elem[0] != cmd) + continue; // another thread modified this state while blocking + + PL_TRACE(vk, "VkSemaphore signalled: 0x%"PRIx64" = %"PRIu64, + (uint64_t) cmd->sync.sem, cmd->sync.value); + PL_ARRAY_REMOVE_AT(vk->cmds_pending, 0); // remove before callbacks + vk_cmd_reset(cmd); + PL_ARRAY_APPEND(pool, pool->cmds, cmd); + ret = true; + + // If we've successfully spent some time waiting for at least one + // command, disable the timeout. This has the dual purpose of both + // making sure we don't over-wait due to repeat timeout application, + // but also makes sure we don't block on future commands if we've + // already spend time waiting for one. + timeout = 0; + } + + pl_mutex_unlock(&vk->lock); + return ret; +} + +void vk_rotate_queues(struct vk_ctx *vk) +{ + pl_mutex_lock(&vk->lock); + + // Rotate the queues to ensure good parallelism across frames + for (int i = 0; i < vk->pools.num; i++) { + struct vk_cmdpool *pool = vk->pools.elem[i]; + pool->idx_queues = (pool->idx_queues + 1) % pool->num_queues; + PL_TRACE(vk, "QF %d: %d/%d", pool->qf, pool->idx_queues, pool->num_queues); + } + + pl_mutex_unlock(&vk->lock); +} + +void vk_wait_idle(struct vk_ctx *vk) +{ + while (vk_poll_commands(vk, UINT64_MAX)) ; +} diff --git a/src/vulkan/command.h b/src/vulkan/command.h new file mode 100644 index 0000000..4c70482 --- /dev/null +++ b/src/vulkan/command.h @@ -0,0 +1,142 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once +#include "common.h" + +// Since lots of vulkan operations need to be done lazily once the affected +// resources are no longer in use, provide an abstraction for tracking these. +// In practice, these are only checked and run when submitting new commands, so +// the actual execution may be delayed by a frame. +typedef void (*vk_cb)(void *p, void *arg); + +struct vk_callback { + vk_cb run; + void *priv; + void *arg; +}; + +// Associate a callback with the completion of all currently pending commands. +// This will essentially run once the device is completely idle. +void vk_dev_callback(struct vk_ctx *vk, vk_cb callback, + const void *priv, const void *arg); + +// Helper wrapper around command buffers that also track dependencies, +// callbacks and synchronization primitives +// +// Thread-safety: Unsafe +struct vk_cmd { + struct vk_cmdpool *pool; // pool it was allocated from + pl_vulkan_sem sync; // pending execution, tied to lifetime of device + VkQueue queue; // the submission queue (for recording/pending) + int qindex; // the index of `queue` in `pool` + VkCommandBuffer buf; // the command buffer itself + // Command dependencies and signals. Not owned by the vk_cmd. + PL_ARRAY(VkSemaphoreSubmitInfo) deps; + PL_ARRAY(VkSemaphoreSubmitInfo) sigs; + // "Callbacks" to fire once a command completes. These are used for + // multiple purposes, ranging from resource deallocation to fencing. + PL_ARRAY(struct vk_callback) callbacks; +}; + +// Associate a callback with the completion of the current command. This +// function will be run once the command completes, or shortly thereafter. +void vk_cmd_callback(struct vk_cmd *cmd, vk_cb callback, + const void *priv, const void *arg); + +// Associate a raw dependency for the current command. This semaphore must +// signal by the corresponding stage before the command may execute. +void vk_cmd_dep(struct vk_cmd *cmd, VkPipelineStageFlags2 stage, pl_vulkan_sem dep); + +// Associate a raw signal with the current command. This semaphore will signal +// after the given stage completes. +void vk_cmd_sig(struct vk_cmd *cmd, VkPipelineStageFlags2 stage, pl_vulkan_sem sig); + +// Compatibility wrappers for vkCmdPipelineBarrier2 (works with pre-1.3) +void vk_cmd_barrier(struct vk_cmd *cmd, const VkDependencyInfo *info); + +// Synchronization scope +struct vk_sync_scope { + pl_vulkan_sem sync; // semaphore of last access + VkQueue queue; // source queue of last access + VkPipelineStageFlags2 stage;// stage bitmask of last access + VkAccessFlags2 access; // access type bitmask +}; + +// Synchronization primitive +struct vk_sem { + struct vk_sync_scope read, write; +}; + +// Updates the `vk_sem` state for a given access. If `is_trans` is set, this +// access is treated as a write (since it alters the resource's state). +// +// Returns a struct describing the previous access to a resource. A pipeline +// barrier is only required if the previous access scope is nonzero. 
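+//
+// Rough usage sketch (illustrative only; `res` stands for some hypothetical
+// resource carrying a vk_sem):
+//
+//     struct vk_sync_scope last = vk_sem_barrier(cmd, &res->sem, stage, access, false);
+//     if (last.access) {
+//         // record a barrier from (last.stage, last.access) to (stage, access),
+//         // e.g. via vk_cmd_barrier()
+//     }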
+struct vk_sync_scope vk_sem_barrier(struct vk_cmd *cmd, struct vk_sem *sem, + VkPipelineStageFlags2 stage, + VkAccessFlags2 access, bool is_trans); + +// Command pool / queue family hybrid abstraction +struct vk_cmdpool { + struct vk_ctx *vk; + VkQueueFamilyProperties props; + int qf; // queue family index + VkCommandPool pool; + VkQueue *queues; + int num_queues; + int idx_queues; + // Command buffers associated with this queue. These are available for + // re-recording + PL_ARRAY(struct vk_cmd *) cmds; +}; + +// Set up a vk_cmdpool corresponding to a queue family. `qnum` may be less than +// `props.queueCount`, to restrict the number of queues in this queue family. +struct vk_cmdpool *vk_cmdpool_create(struct vk_ctx *vk, int qf, int qnum, + VkQueueFamilyProperties props); + +void vk_cmdpool_destroy(struct vk_cmdpool *pool); + +// Fetch a command buffer from a command pool and begin recording to it. +// Returns NULL on failure. +struct vk_cmd *vk_cmd_begin(struct vk_cmdpool *pool, pl_debug_tag debug_tag); + +// Finish recording a command buffer and submit it for execution. This function +// takes over ownership of **cmd, and sets *cmd to NULL in doing so. +bool vk_cmd_submit(struct vk_cmd **cmd); + +// Block until some commands complete executing. This is the only function that +// actually processes the callbacks. Will wait at most `timeout` nanoseconds +// for the completion of any command. The timeout may also be passed as 0, in +// which case this function will not block, but only poll for completed +// commands. Returns whether any forward progress was made. +// +// This does *not* flush any queued commands, forgetting to do so may result +// in infinite loops if waiting for the completion of callbacks that were +// never flushed! +bool vk_poll_commands(struct vk_ctx *vk, uint64_t timeout); + +// Rotate through queues in each command pool. Call this once per frame, after +// submitting all of the command buffers for that frame. Calling this more +// often than that is possible but bad for performance. +void vk_rotate_queues(struct vk_ctx *vk); + +// Wait until all commands are complete, i.e. the device is idle. This is +// basically equivalent to calling `vk_poll_commands` with a timeout of +// UINT64_MAX until it returns `false`. +void vk_wait_idle(struct vk_ctx *vk); diff --git a/src/vulkan/common.h b/src/vulkan/common.h new file mode 100644 index 0000000..31b309e --- /dev/null +++ b/src/vulkan/common.h @@ -0,0 +1,234 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#pragma once + +#define VK_NO_PROTOTYPES +#define VK_ENABLE_BETA_EXTENSIONS // for VK_KHR_portability_subset +#define VK_USE_PLATFORM_METAL_EXT + +#include "../common.h" +#include "../log.h" +#include "../pl_thread.h" + +#include <libplacebo/vulkan.h> + +#ifdef PL_HAVE_WIN32 +#include <windows.h> +#include <vulkan/vulkan_win32.h> +#endif + +// Vulkan allows the optional use of a custom allocator. We don't need one but +// mark this parameter with a better name in case we ever decide to change this +// in the future. (And to make the code more readable) +#define PL_VK_ALLOC NULL + +// Type of a vulkan function that needs to be loaded +#define PL_VK_FUN(name) PFN_vk##name name + +// Load a vulkan instance-level extension function directly (on the stack) +#define PL_VK_LOAD_FUN(inst, name, get_addr) \ + PL_VK_FUN(name) = (PFN_vk##name) get_addr(inst, "vk" #name); + +#ifndef VK_VENDOR_ID_NVIDIA +#define VK_VENDOR_ID_NVIDIA 0x10DE +#endif + +// Shared struct used to hold vulkan context information +struct vk_ctx { + pl_mutex lock; + pl_vulkan vulkan; + void *alloc; // host allocations bound to the lifetime of this vk_ctx + struct vk_malloc *ma; // VRAM malloc layer + pl_vk_inst internal_instance; + pl_log log; + VkInstance inst; + VkPhysicalDevice physd; + VkPhysicalDeviceProperties props; + VkPhysicalDeviceFeatures2 features; + uint32_t api_ver; // device API version + VkDevice dev; + bool imported; // device was not created by us + + // Generic error flag for catching "failed" devices + bool failed; + + // Enabled extensions + PL_ARRAY(const char *) exts; + + // Command pools (one per queue family) + PL_ARRAY(struct vk_cmdpool *) pools; + + // Pointers into `pools` (always set) + struct vk_cmdpool *pool_graphics; + struct vk_cmdpool *pool_compute; + struct vk_cmdpool *pool_transfer; + + // Queue locking functions + PL_ARRAY(PL_ARRAY(pl_mutex)) queue_locks; + void (*lock_queue)(void *queue_ctx, uint32_t qf, uint32_t idx); + void (*unlock_queue)(void *queue_ctx, uint32_t qf, uint32_t idx); + void *queue_ctx; + + // Pending commands. 
These are shared for the entire mpvk_ctx to ensure + // submission and callbacks are FIFO + PL_ARRAY(struct vk_cmd *) cmds_pending; // submitted but not completed + + // Pending callbacks that still need to be drained before processing + // callbacks for the next command (in case commands are recursively being + // polled from another callback) + const struct vk_callback *pending_callbacks; + int num_pending_callbacks; + + // Instance-level function pointers + PL_VK_FUN(CreateDevice); + PL_VK_FUN(EnumerateDeviceExtensionProperties); + PL_VK_FUN(GetDeviceProcAddr); + PL_VK_FUN(GetInstanceProcAddr); + PL_VK_FUN(GetPhysicalDeviceExternalBufferProperties); + PL_VK_FUN(GetPhysicalDeviceExternalSemaphoreProperties); + PL_VK_FUN(GetPhysicalDeviceFeatures2KHR); + PL_VK_FUN(GetPhysicalDeviceFormatProperties); + PL_VK_FUN(GetPhysicalDeviceFormatProperties2KHR); + PL_VK_FUN(GetPhysicalDeviceImageFormatProperties2KHR); + PL_VK_FUN(GetPhysicalDeviceMemoryProperties); + PL_VK_FUN(GetPhysicalDeviceProperties); + PL_VK_FUN(GetPhysicalDeviceProperties2); + PL_VK_FUN(GetPhysicalDeviceQueueFamilyProperties); + PL_VK_FUN(GetPhysicalDeviceSurfaceCapabilitiesKHR); + PL_VK_FUN(GetPhysicalDeviceSurfaceFormatsKHR); + PL_VK_FUN(GetPhysicalDeviceSurfacePresentModesKHR); + PL_VK_FUN(GetPhysicalDeviceSurfaceSupportKHR); + + // Device-level function pointers + PL_VK_FUN(AcquireNextImageKHR); + PL_VK_FUN(AllocateCommandBuffers); + PL_VK_FUN(AllocateDescriptorSets); + PL_VK_FUN(AllocateMemory); + PL_VK_FUN(BeginCommandBuffer); + PL_VK_FUN(BindBufferMemory); + PL_VK_FUN(BindImageMemory); + PL_VK_FUN(CmdBeginDebugUtilsLabelEXT); + PL_VK_FUN(CmdBeginRenderPass); + PL_VK_FUN(CmdBindDescriptorSets); + PL_VK_FUN(CmdBindIndexBuffer); + PL_VK_FUN(CmdBindPipeline); + PL_VK_FUN(CmdBindVertexBuffers); + PL_VK_FUN(CmdBlitImage); + PL_VK_FUN(CmdClearColorImage); + PL_VK_FUN(CmdCopyBuffer); + PL_VK_FUN(CmdCopyBufferToImage); + PL_VK_FUN(CmdCopyImage); + PL_VK_FUN(CmdCopyImageToBuffer); + PL_VK_FUN(CmdDispatch); + PL_VK_FUN(CmdDraw); + PL_VK_FUN(CmdDrawIndexed); + PL_VK_FUN(CmdEndDebugUtilsLabelEXT); + PL_VK_FUN(CmdEndRenderPass); + PL_VK_FUN(CmdPipelineBarrier); + PL_VK_FUN(CmdPipelineBarrier2KHR); + PL_VK_FUN(CmdPushConstants); + PL_VK_FUN(CmdPushDescriptorSetKHR); + PL_VK_FUN(CmdResetQueryPool); + PL_VK_FUN(CmdSetScissor); + PL_VK_FUN(CmdSetViewport); + PL_VK_FUN(CmdUpdateBuffer); + PL_VK_FUN(CmdWriteTimestamp); + PL_VK_FUN(CreateBuffer); + PL_VK_FUN(CreateBufferView); + PL_VK_FUN(CreateCommandPool); + PL_VK_FUN(CreateComputePipelines); + PL_VK_FUN(CreateDebugReportCallbackEXT); + PL_VK_FUN(CreateDescriptorPool); + PL_VK_FUN(CreateDescriptorSetLayout); + PL_VK_FUN(CreateFence); + PL_VK_FUN(CreateFramebuffer); + PL_VK_FUN(CreateGraphicsPipelines); + PL_VK_FUN(CreateImage); + PL_VK_FUN(CreateImageView); + PL_VK_FUN(CreatePipelineCache); + PL_VK_FUN(CreatePipelineLayout); + PL_VK_FUN(CreateQueryPool); + PL_VK_FUN(CreateRenderPass); + PL_VK_FUN(CreateSampler); + PL_VK_FUN(CreateSemaphore); + PL_VK_FUN(CreateShaderModule); + PL_VK_FUN(CreateSwapchainKHR); + PL_VK_FUN(DestroyBuffer); + PL_VK_FUN(DestroyBufferView); + PL_VK_FUN(DestroyCommandPool); + PL_VK_FUN(DestroyDebugReportCallbackEXT); + PL_VK_FUN(DestroyDescriptorPool); + PL_VK_FUN(DestroyDescriptorSetLayout); + PL_VK_FUN(DestroyDevice); + PL_VK_FUN(DestroyFence); + PL_VK_FUN(DestroyFramebuffer); + PL_VK_FUN(DestroyImage); + PL_VK_FUN(DestroyImageView); + PL_VK_FUN(DestroyInstance); + PL_VK_FUN(DestroyPipeline); + PL_VK_FUN(DestroyPipelineCache); + PL_VK_FUN(DestroyPipelineLayout); 
+ PL_VK_FUN(DestroyQueryPool); + PL_VK_FUN(DestroyRenderPass); + PL_VK_FUN(DestroySampler); + PL_VK_FUN(DestroySemaphore); + PL_VK_FUN(DestroyShaderModule); + PL_VK_FUN(DestroySwapchainKHR); + PL_VK_FUN(DeviceWaitIdle); + PL_VK_FUN(EndCommandBuffer); + PL_VK_FUN(FlushMappedMemoryRanges); + PL_VK_FUN(FreeCommandBuffers); + PL_VK_FUN(FreeMemory); + PL_VK_FUN(GetBufferMemoryRequirements); + PL_VK_FUN(GetDeviceQueue); + PL_VK_FUN(GetImageDrmFormatModifierPropertiesEXT); + PL_VK_FUN(GetImageMemoryRequirements2); + PL_VK_FUN(GetImageSubresourceLayout); + PL_VK_FUN(GetMemoryFdKHR); + PL_VK_FUN(GetMemoryFdPropertiesKHR); + PL_VK_FUN(GetMemoryHostPointerPropertiesEXT); + PL_VK_FUN(GetPipelineCacheData); + PL_VK_FUN(GetQueryPoolResults); + PL_VK_FUN(GetSemaphoreFdKHR); + PL_VK_FUN(GetSwapchainImagesKHR); + PL_VK_FUN(InvalidateMappedMemoryRanges); + PL_VK_FUN(MapMemory); + PL_VK_FUN(QueuePresentKHR); + PL_VK_FUN(QueueSubmit); + PL_VK_FUN(QueueSubmit2KHR); + PL_VK_FUN(QueueWaitIdle); + PL_VK_FUN(ResetFences); + PL_VK_FUN(ResetQueryPool); + PL_VK_FUN(SetDebugUtilsObjectNameEXT); + PL_VK_FUN(SetHdrMetadataEXT); + PL_VK_FUN(UpdateDescriptorSets); + PL_VK_FUN(WaitForFences); + PL_VK_FUN(WaitSemaphores); + +#ifdef PL_HAVE_WIN32 + PL_VK_FUN(GetMemoryWin32HandleKHR); + PL_VK_FUN(GetSemaphoreWin32HandleKHR); +#endif + +#ifdef VK_EXT_metal_objects + PL_VK_FUN(ExportMetalObjectsEXT); +#endif +#ifdef VK_EXT_full_screen_exclusive + PL_VK_FUN(AcquireFullScreenExclusiveModeEXT); +#endif +}; diff --git a/src/vulkan/context.c b/src/vulkan/context.c new file mode 100644 index 0000000..ad8a859 --- /dev/null +++ b/src/vulkan/context.c @@ -0,0 +1,1704 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "common.h" +#include "command.h" +#include "utils.h" +#include "gpu.h" + +#ifdef PL_HAVE_VK_PROC_ADDR +VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL vkGetInstanceProcAddr( + VkInstance instance, + const char* pName); +#endif + +const struct pl_vk_inst_params pl_vk_inst_default_params = {0}; + +struct vk_fun { + const char *name; + size_t offset; + bool device_level; +}; + +struct vk_ext { + const char *name; + const struct vk_fun *funs; +}; + +#define PL_VK_INST_FUN(N) \ + { .name = "vk" #N, \ + .offset = offsetof(struct vk_ctx, N), \ + } + +#define PL_VK_DEV_FUN(N) \ + { .name = "vk" #N, \ + .offset = offsetof(struct vk_ctx, N), \ + .device_level = true, \ + } + +// Table of optional vulkan instance extensions +static const char *vk_instance_extensions[] = { + VK_KHR_SURFACE_EXTENSION_NAME, + VK_EXT_SWAPCHAIN_COLOR_SPACE_EXTENSION_NAME, + VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME, + VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME, + VK_KHR_GET_SURFACE_CAPABILITIES_2_EXTENSION_NAME, +}; + +// List of mandatory instance-level function pointers, including functions +// associated with mandatory instance extensions +static const struct vk_fun vk_inst_funs[] = { + PL_VK_INST_FUN(CreateDevice), + PL_VK_INST_FUN(EnumerateDeviceExtensionProperties), + PL_VK_INST_FUN(GetDeviceProcAddr), + PL_VK_INST_FUN(GetPhysicalDeviceExternalBufferProperties), + PL_VK_INST_FUN(GetPhysicalDeviceExternalSemaphoreProperties), + PL_VK_INST_FUN(GetPhysicalDeviceFeatures2KHR), + PL_VK_INST_FUN(GetPhysicalDeviceFormatProperties), + PL_VK_INST_FUN(GetPhysicalDeviceFormatProperties2KHR), + PL_VK_INST_FUN(GetPhysicalDeviceImageFormatProperties2KHR), + PL_VK_INST_FUN(GetPhysicalDeviceMemoryProperties), + PL_VK_INST_FUN(GetPhysicalDeviceProperties), + PL_VK_INST_FUN(GetPhysicalDeviceProperties2), + PL_VK_INST_FUN(GetPhysicalDeviceQueueFamilyProperties), + + // These are not actually mandatory, but they're universal enough that we + // just load them unconditionally (in lieu of not having proper support for + // loading arbitrary instance extensions). Their use is generally guarded + // behind various VkSurfaceKHR values already being provided by the API + // user (implying this extension is loaded). 
+ PL_VK_INST_FUN(GetPhysicalDeviceSurfaceCapabilitiesKHR), + PL_VK_INST_FUN(GetPhysicalDeviceSurfaceFormatsKHR), + PL_VK_INST_FUN(GetPhysicalDeviceSurfacePresentModesKHR), + PL_VK_INST_FUN(GetPhysicalDeviceSurfaceSupportKHR), +}; + +// Table of vulkan device extensions and functions they load, including +// functions exported by dependent instance-level extensions +static const struct vk_ext vk_device_extensions[] = { + { + .name = VK_KHR_SWAPCHAIN_EXTENSION_NAME, + .funs = (const struct vk_fun[]) { + PL_VK_DEV_FUN(AcquireNextImageKHR), + PL_VK_DEV_FUN(CreateSwapchainKHR), + PL_VK_DEV_FUN(DestroySwapchainKHR), + PL_VK_DEV_FUN(GetSwapchainImagesKHR), + PL_VK_DEV_FUN(QueuePresentKHR), + {0} + }, + }, { + .name = VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME, + .funs = (const struct vk_fun[]) { + PL_VK_DEV_FUN(CmdPushDescriptorSetKHR), + {0} + }, + }, { + .name = VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME, + .funs = (const struct vk_fun[]) { + PL_VK_DEV_FUN(GetMemoryFdKHR), + {0} + }, + }, { + .name = VK_EXT_EXTERNAL_MEMORY_DMA_BUF_EXTENSION_NAME, + .funs = (const struct vk_fun[]) { + PL_VK_DEV_FUN(GetMemoryFdPropertiesKHR), + {0} + }, +#ifdef PL_HAVE_WIN32 + }, { + .name = VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME, + .funs = (const struct vk_fun[]) { + PL_VK_DEV_FUN(GetMemoryWin32HandleKHR), + {0} + }, +#endif + }, { + .name = VK_EXT_EXTERNAL_MEMORY_HOST_EXTENSION_NAME, + .funs = (const struct vk_fun[]) { + PL_VK_DEV_FUN(GetMemoryHostPointerPropertiesEXT), + {0} + }, + }, { + .name = VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME, + .funs = (const struct vk_fun[]) { + PL_VK_DEV_FUN(GetSemaphoreFdKHR), + {0} + }, +#ifdef PL_HAVE_WIN32 + }, { + .name = VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME, + .funs = (const struct vk_fun[]) { + PL_VK_DEV_FUN(GetSemaphoreWin32HandleKHR), + {0} + }, +#endif + }, { + .name = VK_EXT_PCI_BUS_INFO_EXTENSION_NAME, + }, { + .name = VK_EXT_HDR_METADATA_EXTENSION_NAME, + .funs = (const struct vk_fun[]) { + PL_VK_DEV_FUN(SetHdrMetadataEXT), + {0} + }, + }, { + .name = VK_EXT_IMAGE_DRM_FORMAT_MODIFIER_EXTENSION_NAME, + .funs = (const struct vk_fun[]) { + PL_VK_DEV_FUN(GetImageDrmFormatModifierPropertiesEXT), + {0} + }, +#ifdef VK_KHR_portability_subset + }, { + .name = VK_KHR_PORTABILITY_SUBSET_EXTENSION_NAME, +#endif +#ifdef VK_EXT_metal_objects + }, { + .name = VK_EXT_METAL_OBJECTS_EXTENSION_NAME, + .funs = (const struct vk_fun[]) { + PL_VK_DEV_FUN(ExportMetalObjectsEXT), + {0} + }, +#endif +#ifdef VK_EXT_full_screen_exclusive + }, { + .name = VK_EXT_FULL_SCREEN_EXCLUSIVE_EXTENSION_NAME, + .funs = (const struct vk_fun[]) { + PL_VK_DEV_FUN(AcquireFullScreenExclusiveModeEXT), + {0} + }, +#endif + }, { + .name = VK_KHR_SYNCHRONIZATION_2_EXTENSION_NAME, + .funs = (const struct vk_fun[]) { + PL_VK_DEV_FUN(CmdPipelineBarrier2KHR), + PL_VK_DEV_FUN(QueueSubmit2KHR), + {0} + }, + }, +}; + +// Make sure to keep this in sync with the above! 
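A minimal illustrative sketch (not part of the diff itself): the vk_fun/vk_ext tables above are consumed generically, since each entry records the offsetof() of its PFN_vk* member inside struct vk_ctx. The helper name below is hypothetical, and it omits the KHR/EXT suffix-stripping fallback that the real load_vk_fun(), defined later in this file, performs for functions promoted to core.

static void load_fun_table_sketch(struct vk_ctx *vk, const struct vk_fun *funs,
                                  size_t num)
{
    for (size_t i = 0; i < num; i++) {
        // Each entry stores the byte offset of its PFN_vk* member inside
        // struct vk_ctx, so one loop can patch every function pointer.
        PFN_vkVoidFunction *pfn = (void *) ((uintptr_t) vk + funs[i].offset);
        *pfn = funs[i].device_level
                   ? vk->GetDeviceProcAddr(vk->dev, funs[i].name)
                   : vk->GetInstanceProcAddr(vk->inst, funs[i].name);
    }
}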
+const char * const pl_vulkan_recommended_extensions[] = { + VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME, + VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME, + VK_EXT_EXTERNAL_MEMORY_HOST_EXTENSION_NAME, + VK_EXT_EXTERNAL_MEMORY_DMA_BUF_EXTENSION_NAME, + VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME, +#ifdef PL_HAVE_WIN32 + VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME, + VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME, +#endif + VK_EXT_PCI_BUS_INFO_EXTENSION_NAME, + VK_EXT_HDR_METADATA_EXTENSION_NAME, + VK_EXT_IMAGE_DRM_FORMAT_MODIFIER_EXTENSION_NAME, +#ifdef VK_KHR_portability_subset + VK_KHR_PORTABILITY_SUBSET_EXTENSION_NAME, +#endif +#ifdef VK_EXT_metal_objects + VK_EXT_METAL_OBJECTS_EXTENSION_NAME, +#endif +#ifdef VK_EXT_full_screen_exclusive + VK_EXT_FULL_SCREEN_EXCLUSIVE_EXTENSION_NAME, +#endif + VK_KHR_SYNCHRONIZATION_2_EXTENSION_NAME, +}; + +const int pl_vulkan_num_recommended_extensions = + PL_ARRAY_SIZE(pl_vulkan_recommended_extensions); + +// +1 because VK_KHR_swapchain is not automatically pulled in +static_assert(PL_ARRAY_SIZE(pl_vulkan_recommended_extensions) + 1 == + PL_ARRAY_SIZE(vk_device_extensions), + "pl_vulkan_recommended_extensions out of sync with " + "vk_device_extensions?"); + +// Recommended features; keep in sync with libavutil vulkan hwcontext +static const VkPhysicalDeviceVulkan13Features recommended_vk13 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_3_FEATURES, + .computeFullSubgroups = true, + .maintenance4 = true, + .shaderZeroInitializeWorkgroupMemory = true, + .synchronization2 = true, +}; + +static const VkPhysicalDeviceVulkan12Features recommended_vk12 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES, + .pNext = (void *) &recommended_vk13, + .bufferDeviceAddress = true, + .storagePushConstant8 = true, + .shaderInt8 = true, + .shaderFloat16 = true, + .shaderSharedInt64Atomics = true, + .storageBuffer8BitAccess = true, + .uniformAndStorageBuffer8BitAccess = true, + .vulkanMemoryModel = true, + .vulkanMemoryModelDeviceScope = true, +}; + +static const VkPhysicalDeviceVulkan11Features recommended_vk11 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES, + .pNext = (void *) &recommended_vk12, + .samplerYcbcrConversion = true, + .storagePushConstant16 = true, +}; + +const VkPhysicalDeviceFeatures2 pl_vulkan_recommended_features = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2, + .pNext = (void *) &recommended_vk11, + .features = { + .shaderImageGatherExtended = true, + .shaderStorageImageReadWithoutFormat = true, + .shaderStorageImageWriteWithoutFormat = true, + + // Needed for GPU-assisted validation, but not harmful to enable + .fragmentStoresAndAtomics = true, + .vertexPipelineStoresAndAtomics = true, + .shaderInt64 = true, + } +}; + +// Required features +static const VkPhysicalDeviceVulkan12Features required_vk12 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES, + .hostQueryReset = true, + .timelineSemaphore = true, +}; + +static const VkPhysicalDeviceVulkan11Features required_vk11 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES, + .pNext = (void *) &required_vk12, +}; + +const VkPhysicalDeviceFeatures2 pl_vulkan_required_features = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2, + .pNext = (void *) &required_vk11, +}; + +static bool check_required_features(struct vk_ctx *vk) +{ + #define CHECK_FEATURE(maj, min, feat) do { \ + const VkPhysicalDeviceVulkan##maj##min##Features *f; \ + f = vk_find_struct(&vk->features, \ + 
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_##maj##_##min##_FEATURES); \ + if (!f || !f->feat) { \ + PL_ERR(vk, "Missing device feature: " #feat); \ + return false; \ + } \ + } while (0) + + CHECK_FEATURE(1, 2, hostQueryReset); + CHECK_FEATURE(1, 2, timelineSemaphore); + + #undef CHECK_FEATURE + return true; +} + + +// List of mandatory device-level functions +// +// Note: Also includes VK_EXT_debug_utils functions, even though they aren't +// mandatory, simply because we load that extension in a special way. +static const struct vk_fun vk_dev_funs[] = { + PL_VK_DEV_FUN(AllocateCommandBuffers), + PL_VK_DEV_FUN(AllocateDescriptorSets), + PL_VK_DEV_FUN(AllocateMemory), + PL_VK_DEV_FUN(BeginCommandBuffer), + PL_VK_DEV_FUN(BindBufferMemory), + PL_VK_DEV_FUN(BindImageMemory), + PL_VK_DEV_FUN(CmdBeginDebugUtilsLabelEXT), + PL_VK_DEV_FUN(CmdBeginRenderPass), + PL_VK_DEV_FUN(CmdBindDescriptorSets), + PL_VK_DEV_FUN(CmdBindIndexBuffer), + PL_VK_DEV_FUN(CmdBindPipeline), + PL_VK_DEV_FUN(CmdBindVertexBuffers), + PL_VK_DEV_FUN(CmdBlitImage), + PL_VK_DEV_FUN(CmdClearColorImage), + PL_VK_DEV_FUN(CmdCopyBuffer), + PL_VK_DEV_FUN(CmdCopyBufferToImage), + PL_VK_DEV_FUN(CmdCopyImage), + PL_VK_DEV_FUN(CmdCopyImageToBuffer), + PL_VK_DEV_FUN(CmdDispatch), + PL_VK_DEV_FUN(CmdDraw), + PL_VK_DEV_FUN(CmdDrawIndexed), + PL_VK_DEV_FUN(CmdEndDebugUtilsLabelEXT), + PL_VK_DEV_FUN(CmdEndRenderPass), + PL_VK_DEV_FUN(CmdPipelineBarrier), + PL_VK_DEV_FUN(CmdPushConstants), + PL_VK_DEV_FUN(CmdResetQueryPool), + PL_VK_DEV_FUN(CmdSetScissor), + PL_VK_DEV_FUN(CmdSetViewport), + PL_VK_DEV_FUN(CmdUpdateBuffer), + PL_VK_DEV_FUN(CmdWriteTimestamp), + PL_VK_DEV_FUN(CreateBuffer), + PL_VK_DEV_FUN(CreateBufferView), + PL_VK_DEV_FUN(CreateCommandPool), + PL_VK_DEV_FUN(CreateComputePipelines), + PL_VK_DEV_FUN(CreateDescriptorPool), + PL_VK_DEV_FUN(CreateDescriptorSetLayout), + PL_VK_DEV_FUN(CreateFence), + PL_VK_DEV_FUN(CreateFramebuffer), + PL_VK_DEV_FUN(CreateGraphicsPipelines), + PL_VK_DEV_FUN(CreateImage), + PL_VK_DEV_FUN(CreateImageView), + PL_VK_DEV_FUN(CreatePipelineCache), + PL_VK_DEV_FUN(CreatePipelineLayout), + PL_VK_DEV_FUN(CreateQueryPool), + PL_VK_DEV_FUN(CreateRenderPass), + PL_VK_DEV_FUN(CreateSampler), + PL_VK_DEV_FUN(CreateSemaphore), + PL_VK_DEV_FUN(CreateShaderModule), + PL_VK_DEV_FUN(DestroyBuffer), + PL_VK_DEV_FUN(DestroyBufferView), + PL_VK_DEV_FUN(DestroyCommandPool), + PL_VK_DEV_FUN(DestroyDescriptorPool), + PL_VK_DEV_FUN(DestroyDescriptorSetLayout), + PL_VK_DEV_FUN(DestroyDevice), + PL_VK_DEV_FUN(DestroyFence), + PL_VK_DEV_FUN(DestroyFramebuffer), + PL_VK_DEV_FUN(DestroyImage), + PL_VK_DEV_FUN(DestroyImageView), + PL_VK_DEV_FUN(DestroyInstance), + PL_VK_DEV_FUN(DestroyPipeline), + PL_VK_DEV_FUN(DestroyPipelineCache), + PL_VK_DEV_FUN(DestroyPipelineLayout), + PL_VK_DEV_FUN(DestroyQueryPool), + PL_VK_DEV_FUN(DestroyRenderPass), + PL_VK_DEV_FUN(DestroySampler), + PL_VK_DEV_FUN(DestroySemaphore), + PL_VK_DEV_FUN(DestroyShaderModule), + PL_VK_DEV_FUN(DeviceWaitIdle), + PL_VK_DEV_FUN(EndCommandBuffer), + PL_VK_DEV_FUN(FlushMappedMemoryRanges), + PL_VK_DEV_FUN(FreeCommandBuffers), + PL_VK_DEV_FUN(FreeMemory), + PL_VK_DEV_FUN(GetBufferMemoryRequirements), + PL_VK_DEV_FUN(GetDeviceQueue), + PL_VK_DEV_FUN(GetImageMemoryRequirements2), + PL_VK_DEV_FUN(GetImageSubresourceLayout), + PL_VK_DEV_FUN(GetPipelineCacheData), + PL_VK_DEV_FUN(GetQueryPoolResults), + PL_VK_DEV_FUN(InvalidateMappedMemoryRanges), + PL_VK_DEV_FUN(MapMemory), + PL_VK_DEV_FUN(QueueSubmit), + PL_VK_DEV_FUN(QueueWaitIdle), + PL_VK_DEV_FUN(ResetFences), + 
PL_VK_DEV_FUN(ResetQueryPool), + PL_VK_DEV_FUN(SetDebugUtilsObjectNameEXT), + PL_VK_DEV_FUN(UpdateDescriptorSets), + PL_VK_DEV_FUN(WaitForFences), + PL_VK_DEV_FUN(WaitSemaphores), +}; + +static void load_vk_fun(struct vk_ctx *vk, const struct vk_fun *fun) +{ + PFN_vkVoidFunction *pfn = (void *) ((uintptr_t) vk + (ptrdiff_t) fun->offset); + + if (fun->device_level) { + *pfn = vk->GetDeviceProcAddr(vk->dev, fun->name); + } else { + *pfn = vk->GetInstanceProcAddr(vk->inst, fun->name); + }; + + if (!*pfn) { + // Some functions get their extension suffix stripped when promoted + // to core. As a very simple work-around to this, try loading the + // function a second time with the reserved suffixes stripped. + static const char *ext_suffixes[] = { "KHR", "EXT" }; + pl_str fun_name = pl_str0(fun->name); + char buf[64]; + + for (int i = 0; i < PL_ARRAY_SIZE(ext_suffixes); i++) { + if (!pl_str_eatend0(&fun_name, ext_suffixes[i])) + continue; + + pl_assert(sizeof(buf) > fun_name.len); + snprintf(buf, sizeof(buf), "%.*s", PL_STR_FMT(fun_name)); + if (fun->device_level) { + *pfn = vk->GetDeviceProcAddr(vk->dev, buf); + } else { + *pfn = vk->GetInstanceProcAddr(vk->inst, buf); + } + return; + } + } +} + +// Private struct for pl_vk_inst +struct priv { + VkDebugUtilsMessengerEXT debug_utils_cb; +}; + +void pl_vk_inst_destroy(pl_vk_inst *inst_ptr) +{ + pl_vk_inst inst = *inst_ptr; + if (!inst) + return; + + struct priv *p = PL_PRIV(inst); + if (p->debug_utils_cb) { + PL_VK_LOAD_FUN(inst->instance, DestroyDebugUtilsMessengerEXT, inst->get_proc_addr); + DestroyDebugUtilsMessengerEXT(inst->instance, p->debug_utils_cb, PL_VK_ALLOC); + } + + PL_VK_LOAD_FUN(inst->instance, DestroyInstance, inst->get_proc_addr); + DestroyInstance(inst->instance, PL_VK_ALLOC); + pl_free_ptr((void **) inst_ptr); +} + +static VkBool32 VKAPI_PTR vk_dbg_utils_cb(VkDebugUtilsMessageSeverityFlagBitsEXT sev, + VkDebugUtilsMessageTypeFlagsEXT msgType, + const VkDebugUtilsMessengerCallbackDataEXT *data, + void *priv) +{ + pl_log log = priv; + + // Ignore errors for messages that we consider false positives + switch (data->messageIdNumber) { + case 0x7cd0911d: // VUID-VkSwapchainCreateInfoKHR-imageExtent-01274 + case 0x8928392f: // UNASSIGNED-BestPractices-NonSuccess-Result + case 0xdc18ad6b: // UNASSIGNED-BestPractices-vkAllocateMemory-small-allocation + case 0xb3d4346b: // UNASSIGNED-BestPractices-vkBindMemory-small-dedicated-allocation + case 0x6cfe18a5: // UNASSIGNED-BestPractices-SemaphoreCount + case 0x48a09f6c: // UNASSIGNED-BestPractices-pipeline-stage-flags + // profile chain expectations + case 0x30f4ac70: // VUID-VkImageCreateInfo-pNext-06811 + return false; + + case 0x5f379b89: // UNASSIGNED-BestPractices-Error-Result + if (strstr(data->pMessage, "VK_ERROR_FORMAT_NOT_SUPPORTED")) + return false; + break; + + case 0xf6a37cfa: // VUID-vkGetImageSubresourceLayout-format-04461 + // Work around https://github.com/KhronosGroup/Vulkan-Docs/issues/2109 + return false; + } + + enum pl_log_level lev; + switch (sev) { + case VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT: lev = PL_LOG_ERR; break; + case VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT: lev = PL_LOG_WARN; break; + case VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT: lev = PL_LOG_DEBUG; break; + case VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT: lev = PL_LOG_TRACE; break; + default: lev = PL_LOG_INFO; break; + } + + pl_msg(log, lev, "vk %s", data->pMessage); + + for (int i = 0; i < data->queueLabelCount; i++) + pl_msg(log, lev, " during %s", 
data->pQueueLabels[i].pLabelName); + for (int i = 0; i < data->cmdBufLabelCount; i++) + pl_msg(log, lev, " inside %s", data->pCmdBufLabels[i].pLabelName); + for (int i = 0; i < data->objectCount; i++) { + const VkDebugUtilsObjectNameInfoEXT *obj = &data->pObjects[i]; + pl_msg(log, lev, " using %s: %s (0x%llx)", + vk_obj_type(obj->objectType), + obj->pObjectName ? obj->pObjectName : "anon", + (unsigned long long) obj->objectHandle); + } + + // The return value of this function determines whether the call will + // be explicitly aborted (to prevent GPU errors) or not. In this case, + // we generally want this to be on for the validation errors, but nothing + // else (e.g. performance warnings) + bool is_error = (sev & VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT) && + (msgType & VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT); + + if (is_error) { + pl_log_stack_trace(log, lev); + pl_debug_abort(); + return true; + } + + return false; +} + +static PFN_vkGetInstanceProcAddr get_proc_addr_fallback(pl_log log, + PFN_vkGetInstanceProcAddr get_proc_addr) +{ + if (get_proc_addr) + return get_proc_addr; + +#ifdef PL_HAVE_VK_PROC_ADDR + return vkGetInstanceProcAddr; +#else + pl_fatal(log, "No `vkGetInstanceProcAddr` function provided, and " + "libplacebo built without linking against this function!"); + return NULL; +#endif +} + +#define PRINTF_VER(ver) \ + (int) VK_API_VERSION_MAJOR(ver), \ + (int) VK_API_VERSION_MINOR(ver), \ + (int) VK_API_VERSION_PATCH(ver) + +pl_vk_inst pl_vk_inst_create(pl_log log, const struct pl_vk_inst_params *params) +{ + void *tmp = pl_tmp(NULL); + params = PL_DEF(params, &pl_vk_inst_default_params); + VkInstance inst = NULL; + pl_clock_t start; + + PL_ARRAY(const char *) exts = {0}; + + PFN_vkGetInstanceProcAddr get_addr; + if (!(get_addr = get_proc_addr_fallback(log, params->get_proc_addr))) + goto error; + + // Query instance version support + uint32_t api_ver = VK_API_VERSION_1_0; + PL_VK_LOAD_FUN(NULL, EnumerateInstanceVersion, get_addr); + if (EnumerateInstanceVersion && EnumerateInstanceVersion(&api_ver) != VK_SUCCESS) + goto error; + + pl_debug(log, "Available instance version: %d.%d.%d", PRINTF_VER(api_ver)); + + if (params->max_api_version) { + api_ver = PL_MIN(api_ver, params->max_api_version); + pl_info(log, "Restricting API version to %d.%d.%d... 
new version %d.%d.%d", + PRINTF_VER(params->max_api_version), PRINTF_VER(api_ver)); + } + + if (api_ver < PL_VK_MIN_VERSION) { + pl_fatal(log, "Instance API version %d.%d.%d is lower than the minimum " + "required version of %d.%d.%d, cannot proceed!", + PRINTF_VER(api_ver), PRINTF_VER(PL_VK_MIN_VERSION)); + goto error; + } + + VkInstanceCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, + .pApplicationInfo = &(VkApplicationInfo) { + .sType = VK_STRUCTURE_TYPE_APPLICATION_INFO, + .apiVersion = api_ver, + }, + }; + + // Enumerate all supported layers + start = pl_clock_now(); + PL_VK_LOAD_FUN(NULL, EnumerateInstanceLayerProperties, get_addr); + uint32_t num_layers_avail = 0; + EnumerateInstanceLayerProperties(&num_layers_avail, NULL); + VkLayerProperties *layers_avail = pl_calloc_ptr(tmp, num_layers_avail, layers_avail); + EnumerateInstanceLayerProperties(&num_layers_avail, layers_avail); + pl_log_cpu_time(log, start, pl_clock_now(), "enumerating instance layers"); + + pl_debug(log, "Available layers:"); + for (int i = 0; i < num_layers_avail; i++) { + pl_debug(log, " %s (v%d.%d.%d)", layers_avail[i].layerName, + PRINTF_VER(layers_avail[i].specVersion)); + } + + PL_ARRAY(const char *) layers = {0}; + + // Sorted by priority + static const char *debug_layers[] = { + "VK_LAYER_KHRONOS_validation", + "VK_LAYER_LUNARG_standard_validation", + }; + + // This layer has to be initialized first, otherwise all sorts of weirdness + // happens (random segfaults, yum) + bool debug = params->debug; + uint32_t debug_layer = 0; // layer idx of debug layer + uint32_t debug_layer_version = 0; + if (debug) { + for (int i = 0; i < PL_ARRAY_SIZE(debug_layers); i++) { + for (int n = 0; n < num_layers_avail; n++) { + if (strcmp(debug_layers[i], layers_avail[n].layerName) != 0) + continue; + + debug_layer = n; + debug_layer_version = layers_avail[n].specVersion; + pl_info(log, "Enabling debug meta layer: %s (v%d.%d.%d)", + debug_layers[i], PRINTF_VER(debug_layer_version)); + PL_ARRAY_APPEND(tmp, layers, debug_layers[i]); + goto debug_layers_done; + } + } + + // No layer found.. + pl_warn(log, "API debugging requested but no debug meta layers present... 
ignoring"); + debug = false; + } + +debug_layers_done: ; + + for (int i = 0; i < params->num_layers; i++) + PL_ARRAY_APPEND(tmp, layers, params->layers[i]); + + for (int i = 0; i < params->num_opt_layers; i++) { + const char *layer = params->opt_layers[i]; + for (int n = 0; n < num_layers_avail; n++) { + if (strcmp(layer, layers_avail[n].layerName) == 0) { + PL_ARRAY_APPEND(tmp, layers, layer); + break; + } + } + } + + // Enumerate all supported extensions + start = pl_clock_now(); + PL_VK_LOAD_FUN(NULL, EnumerateInstanceExtensionProperties, get_addr); + uint32_t num_exts_avail = 0; + EnumerateInstanceExtensionProperties(NULL, &num_exts_avail, NULL); + VkExtensionProperties *exts_avail = pl_calloc_ptr(tmp, num_exts_avail, exts_avail); + EnumerateInstanceExtensionProperties(NULL, &num_exts_avail, exts_avail); + + struct { + VkExtensionProperties *exts; + uint32_t num_exts; + } *layer_exts = pl_calloc_ptr(tmp, num_layers_avail, layer_exts); + + // Enumerate extensions from layers + for (int i = 0; i < num_layers_avail; i++) { + VkExtensionProperties **lexts = &layer_exts[i].exts; + uint32_t *num = &layer_exts[i].num_exts; + + EnumerateInstanceExtensionProperties(layers_avail[i].layerName, num, NULL); + *lexts = pl_calloc_ptr(tmp, *num, *lexts); + EnumerateInstanceExtensionProperties(layers_avail[i].layerName, num, *lexts); + + // Replace all extensions that are already available globally by {0} + for (int j = 0; j < *num; j++) { + for (int k = 0; k < num_exts_avail; k++) { + if (strcmp((*lexts)[j].extensionName, exts_avail[k].extensionName) == 0) + (*lexts)[j] = (VkExtensionProperties) {0}; + } + } + } + + pl_log_cpu_time(log, start, pl_clock_now(), "enumerating instance extensions"); + pl_debug(log, "Available instance extensions:"); + for (int i = 0; i < num_exts_avail; i++) + pl_debug(log, " %s", exts_avail[i].extensionName); + for (int i = 0; i < num_layers_avail; i++) { + for (int j = 0; j < layer_exts[i].num_exts; j++) { + if (!layer_exts[i].exts[j].extensionName[0]) + continue; + + pl_debug(log, " %s (via %s)", + layer_exts[i].exts[j].extensionName, + layers_avail[i].layerName); + } + } + + // Add mandatory extensions + PL_ARRAY_APPEND(tmp, exts, VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME); + + // Add optional extensions + for (int i = 0; i < PL_ARRAY_SIZE(vk_instance_extensions); i++) { + const char *ext = vk_instance_extensions[i]; + for (int n = 0; n < num_exts_avail; n++) { + if (strcmp(ext, exts_avail[n].extensionName) == 0) { + PL_ARRAY_APPEND(tmp, exts, ext); + break; + } + } + } + +#ifdef VK_KHR_portability_enumeration + // Required for macOS ( MoltenVK ) compatibility + for (int n = 0; n < num_exts_avail; n++) { + if (strcmp(VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME, exts_avail[n].extensionName) == 0) { + PL_ARRAY_APPEND(tmp, exts, VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME); + info.flags |= VK_INSTANCE_CREATE_ENUMERATE_PORTABILITY_BIT_KHR; + break; + } + } +#endif + + // Add extra user extensions + for (int i = 0; i < params->num_extensions; i++) { + const char *ext = params->extensions[i]; + PL_ARRAY_APPEND(tmp, exts, ext); + + // Enable any additional layers that are required for this extension + for (int n = 0; n < num_layers_avail; n++) { + for (int j = 0; j < layer_exts[n].num_exts; j++) { + if (!layer_exts[n].exts[j].extensionName[0]) + continue; + if (strcmp(ext, layer_exts[n].exts[j].extensionName) == 0) { + PL_ARRAY_APPEND(tmp, layers, layers_avail[n].layerName); + goto next_user_ext; + } + } + } + +next_user_ext: ; + } + + // Add extra optional 
user extensions + for (int i = 0; i < params->num_opt_extensions; i++) { + const char *ext = params->opt_extensions[i]; + for (int n = 0; n < num_exts_avail; n++) { + if (strcmp(ext, exts_avail[n].extensionName) == 0) { + PL_ARRAY_APPEND(tmp, exts, ext); + goto next_opt_user_ext; + } + } + + for (int n = 0; n < num_layers_avail; n++) { + for (int j = 0; j < layer_exts[n].num_exts; j++) { + if (!layer_exts[n].exts[j].extensionName[0]) + continue; + if (strcmp(ext, layer_exts[n].exts[j].extensionName) == 0) { + PL_ARRAY_APPEND(tmp, exts, ext); + PL_ARRAY_APPEND(tmp, layers, layers_avail[n].layerName); + goto next_opt_user_ext; + } + } + } + +next_opt_user_ext: ; + } + + // If debugging is enabled, load the necessary debug utils extension + if (debug) { + const char * const ext = VK_EXT_DEBUG_UTILS_EXTENSION_NAME; + for (int n = 0; n < num_exts_avail; n++) { + if (strcmp(ext, exts_avail[n].extensionName) == 0) { + PL_ARRAY_APPEND(tmp, exts, ext); + goto debug_ext_done; + } + } + + for (int n = 0; n < layer_exts[debug_layer].num_exts; n++) { + if (strcmp(ext, layer_exts[debug_layer].exts[n].extensionName) == 0) { + PL_ARRAY_APPEND(tmp, exts, ext); + goto debug_ext_done; + } + } + + // No extension found + pl_warn(log, "API debug layers enabled but no debug report extension " + "found... ignoring. Debug messages may be spilling to " + "stdout/stderr!"); + debug = false; + } + +debug_ext_done: ; + + // Limit this to 1.3.250+ because of bugs in older versions. + if (debug && params->debug_extra && + debug_layer_version >= VK_MAKE_API_VERSION(0, 1, 3, 259)) + { + // Try enabling as many validation features as possible + static const VkValidationFeatureEnableEXT validation_features[] = { + VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT, + VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_RESERVE_BINDING_SLOT_EXT, + VK_VALIDATION_FEATURE_ENABLE_BEST_PRACTICES_EXT, + VK_VALIDATION_FEATURE_ENABLE_SYNCHRONIZATION_VALIDATION_EXT, + }; + + static const VkValidationFeaturesEXT vinfo = { + .sType = VK_STRUCTURE_TYPE_VALIDATION_FEATURES_EXT, + .pEnabledValidationFeatures = validation_features, + .enabledValidationFeatureCount = PL_ARRAY_SIZE(validation_features), + }; + + const char * const ext = VK_EXT_VALIDATION_FEATURES_EXTENSION_NAME; + for (int n = 0; n < num_exts_avail; n++) { + if (strcmp(ext, exts_avail[n].extensionName) == 0) { + PL_ARRAY_APPEND(tmp, exts, ext); + vk_link_struct(&info, &vinfo); + goto debug_extra_ext_done; + } + } + + for (int n = 0; n < layer_exts[debug_layer].num_exts; n++) { + if (strcmp(ext, layer_exts[debug_layer].exts[n].extensionName) == 0) { + PL_ARRAY_APPEND(tmp, exts, ext); + vk_link_struct(&info, &vinfo); + goto debug_extra_ext_done; + } + } + + pl_warn(log, "GPU-assisted validation enabled but not supported by " + "instance, disabling..."); + } + +debug_extra_ext_done: ; + + info.ppEnabledExtensionNames = exts.elem; + info.enabledExtensionCount = exts.num; + info.ppEnabledLayerNames = layers.elem; + info.enabledLayerCount = layers.num; + + pl_info(log, "Creating vulkan instance%s", exts.num ? 
" with extensions:" : ""); + for (int i = 0; i < exts.num; i++) + pl_info(log, " %s", exts.elem[i]); + + if (layers.num) { + pl_info(log, " and layers:"); + for (int i = 0; i < layers.num; i++) + pl_info(log, " %s", layers.elem[i]); + } + + start = pl_clock_now(); + PL_VK_LOAD_FUN(NULL, CreateInstance, get_addr); + VkResult res = CreateInstance(&info, PL_VK_ALLOC, &inst); + pl_log_cpu_time(log, start, pl_clock_now(), "creating vulkan instance"); + if (res != VK_SUCCESS) { + pl_fatal(log, "Failed creating instance: %s", vk_res_str(res)); + goto error; + } + + struct pl_vk_inst_t *pl_vk = pl_zalloc_obj(NULL, pl_vk, struct priv); + struct priv *p = PL_PRIV(pl_vk); + *pl_vk = (struct pl_vk_inst_t) { + .instance = inst, + .api_version = api_ver, + .get_proc_addr = get_addr, + .extensions = pl_steal(pl_vk, exts.elem), + .num_extensions = exts.num, + .layers = pl_steal(pl_vk, layers.elem), + .num_layers = layers.num, + }; + + // Set up a debug callback to catch validation messages + if (debug) { + VkDebugUtilsMessengerCreateInfoEXT dinfo = { + .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT, + .messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT, + .messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT, + .pfnUserCallback = vk_dbg_utils_cb, + .pUserData = (void *) log, + }; + + PL_VK_LOAD_FUN(inst, CreateDebugUtilsMessengerEXT, get_addr); + CreateDebugUtilsMessengerEXT(inst, &dinfo, PL_VK_ALLOC, &p->debug_utils_cb); + } + + pl_free(tmp); + return pl_vk; + +error: + pl_fatal(log, "Failed initializing vulkan instance"); + if (inst) { + PL_VK_LOAD_FUN(inst, DestroyInstance, get_addr); + DestroyInstance(inst, PL_VK_ALLOC); + } + pl_free(tmp); + return NULL; +} + +const struct pl_vulkan_params pl_vulkan_default_params = { PL_VULKAN_DEFAULTS }; + +void pl_vulkan_destroy(pl_vulkan *pl_vk) +{ + if (!*pl_vk) + return; + + struct vk_ctx *vk = PL_PRIV(*pl_vk); + if (vk->dev) { + if ((*pl_vk)->gpu) { + PL_DEBUG(vk, "Waiting for remaining commands..."); + pl_gpu_finish((*pl_vk)->gpu); + pl_assert(vk->cmds_pending.num == 0); + + pl_gpu_destroy((*pl_vk)->gpu); + } + vk_malloc_destroy(&vk->ma); + for (int i = 0; i < vk->pools.num; i++) + vk_cmdpool_destroy(vk->pools.elem[i]); + + if (!vk->imported) + vk->DestroyDevice(vk->dev, PL_VK_ALLOC); + } + + for (int i = 0; i < vk->queue_locks.num; i++) { + for (int n = 0; n < vk->queue_locks.elem[i].num; n++) + pl_mutex_destroy(&vk->queue_locks.elem[i].elem[n]); + } + + pl_vk_inst_destroy(&vk->internal_instance); + pl_mutex_destroy(&vk->lock); + pl_free_ptr((void **) pl_vk); +} + +static bool supports_surf(pl_log log, VkInstance inst, + PFN_vkGetInstanceProcAddr get_addr, + VkPhysicalDevice physd, VkSurfaceKHR surf) +{ + // Hack for the VK macro's logging to work + struct { pl_log log; } *vk = (void *) &log; + + PL_VK_LOAD_FUN(inst, GetPhysicalDeviceQueueFamilyProperties, get_addr); + PL_VK_LOAD_FUN(inst, GetPhysicalDeviceSurfaceSupportKHR, get_addr); + uint32_t qfnum = 0; + GetPhysicalDeviceQueueFamilyProperties(physd, &qfnum, NULL); + + for (int i = 0; i < qfnum; i++) { + VkBool32 sup = false; + VK(GetPhysicalDeviceSurfaceSupportKHR(physd, i, surf, &sup)); + if (sup) + return true; + } + +error: + return false; +} + +VkPhysicalDevice pl_vulkan_choose_device(pl_log log, + const struct 
pl_vulkan_device_params *params) +{ + // Hack for the VK macro's logging to work + struct { pl_log log; } *vk = (void *) &log; + PL_INFO(vk, "Probing for vulkan devices:"); + + pl_assert(params->instance); + VkInstance inst = params->instance; + VkPhysicalDevice dev = VK_NULL_HANDLE; + + PFN_vkGetInstanceProcAddr get_addr; + if (!(get_addr = get_proc_addr_fallback(log, params->get_proc_addr))) + return NULL; + + PL_VK_LOAD_FUN(inst, EnumeratePhysicalDevices, get_addr); + PL_VK_LOAD_FUN(inst, GetPhysicalDeviceProperties2, get_addr); + pl_assert(GetPhysicalDeviceProperties2); + + pl_clock_t start = pl_clock_now(); + VkPhysicalDevice *devices = NULL; + uint32_t num = 0; + VK(EnumeratePhysicalDevices(inst, &num, NULL)); + devices = pl_calloc_ptr(NULL, num, devices); + VK(EnumeratePhysicalDevices(inst, &num, devices)); + pl_log_cpu_time(log, start, pl_clock_now(), "enumerating physical devices"); + + static const struct { const char *name; int priority; } types[] = { + [VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU] = {"discrete", 5}, + [VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU] = {"integrated", 4}, + [VK_PHYSICAL_DEVICE_TYPE_VIRTUAL_GPU] = {"virtual", 3}, + [VK_PHYSICAL_DEVICE_TYPE_CPU] = {"software", 2}, + [VK_PHYSICAL_DEVICE_TYPE_OTHER] = {"other", 1}, + }; + + static const uint8_t nil[VK_UUID_SIZE] = {0}; + bool uuid_set = memcmp(params->device_uuid, nil, VK_UUID_SIZE) != 0; + + int best = -1; + for (int i = 0; i < num; i++) { + VkPhysicalDeviceIDPropertiesKHR id_props = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES_KHR, + }; + + VkPhysicalDeviceProperties2 prop = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR, + .pNext = &id_props, + }; + + GetPhysicalDeviceProperties2(devices[i], &prop); + VkPhysicalDeviceType t = prop.properties.deviceType; + const char *dtype = t < PL_ARRAY_SIZE(types) ? types[t].name : "unknown?"; + PL_INFO(vk, " GPU %d: %s v%d.%d.%d (%s)", i, prop.properties.deviceName, + PRINTF_VER(prop.properties.apiVersion), dtype); + PL_INFO(vk, " uuid: %s", PRINT_UUID(id_props.deviceUUID)); + + if (params->surface) { + if (!supports_surf(log, inst, get_addr, devices[i], params->surface)) { + PL_DEBUG(vk, " -> excluding due to lack of surface support"); + continue; + } + } + + if (uuid_set) { + if (memcmp(id_props.deviceUUID, params->device_uuid, VK_UUID_SIZE) == 0) { + dev = devices[i]; + continue; + } else { + PL_DEBUG(vk, " -> excluding due to UUID mismatch"); + continue; + } + } else if (params->device_name && params->device_name[0] != '\0') { + if (strcmp(params->device_name, prop.properties.deviceName) == 0) { + dev = devices[i]; + continue; + } else { + PL_DEBUG(vk, " -> excluding due to name mismatch"); + continue; + } + } + + if (!params->allow_software && t == VK_PHYSICAL_DEVICE_TYPE_CPU) { + PL_DEBUG(vk, " -> excluding due to !params->allow_software"); + continue; + } + + if (prop.properties.apiVersion < PL_VK_MIN_VERSION) { + PL_DEBUG(vk, " -> excluding due to too low API version"); + continue; + } + + int priority = t < PL_ARRAY_SIZE(types) ? 
types[t].priority : 0;
+ if (priority > best) {
+ dev = devices[i];
+ best = priority;
+ }
+ }
+
+error:
+ pl_free(devices);
+ return dev;
+}
+
+static void lock_queue_internal(void *priv, uint32_t qf, uint32_t qidx)
+{
+ struct vk_ctx *vk = priv;
+ pl_mutex_lock(&vk->queue_locks.elem[qf].elem[qidx]);
+}
+
+static void unlock_queue_internal(void *priv, uint32_t qf, uint32_t qidx)
+{
+ struct vk_ctx *vk = priv;
+ pl_mutex_unlock(&vk->queue_locks.elem[qf].elem[qidx]);
+}
+
+static void init_queue_locks(struct vk_ctx *vk, uint32_t qfnum,
+ const VkQueueFamilyProperties *qfs)
+{
+ vk->queue_locks.elem = pl_calloc_ptr(vk->alloc, qfnum, vk->queue_locks.elem);
+ vk->queue_locks.num = qfnum;
+ for (int i = 0; i < qfnum; i++) {
+ const uint32_t qnum = qfs[i].queueCount;
+ vk->queue_locks.elem[i].elem = pl_calloc(vk->alloc, qnum, sizeof(pl_mutex));
+ vk->queue_locks.elem[i].num = qnum;
+ for (int n = 0; n < qnum; n++)
+ pl_mutex_init(&vk->queue_locks.elem[i].elem[n]);
+ }
+
+ vk->lock_queue = lock_queue_internal;
+ vk->unlock_queue = unlock_queue_internal;
+ vk->queue_ctx = vk;
+}
+
+// Find the most specialized queue supporting a combination of flags. In cases
+// where there are multiple queue families at the same specialization level,
+// this finds the one with the most queues. Returns -1 if no queue was found.
+static int find_qf(VkQueueFamilyProperties *qfs, int qfnum, VkQueueFlags flags)
+{
+ int idx = -1;
+ for (int i = 0; i < qfnum; i++) {
+ if ((qfs[i].queueFlags & flags) != flags)
+ continue;
+
+ // QF is more specialized. Since we don't care about other bits like
+ // SPARSE_BIT, mask the ones we're interested in
+ const VkQueueFlags mask = VK_QUEUE_GRAPHICS_BIT |
+ VK_QUEUE_TRANSFER_BIT |
+ VK_QUEUE_COMPUTE_BIT;
+
+ if (idx < 0 || (qfs[i].queueFlags & mask) < (qfs[idx].queueFlags & mask))
+ idx = i;
+
+ // QF has more queues (at the same specialization level)
+ if (qfs[i].queueFlags == qfs[idx].queueFlags &&
+ qfs[i].queueCount > qfs[idx].queueCount)
+ idx = i;
+ }
+
+ return idx;
+}
+
+static bool device_init(struct vk_ctx *vk, const struct pl_vulkan_params *params)
+{
+ pl_assert(vk->physd);
+ void *tmp = pl_tmp(NULL);
+
+ // Enumerate the queue families and find suitable families for each task
+ uint32_t qfnum = 0;
+ vk->GetPhysicalDeviceQueueFamilyProperties(vk->physd, &qfnum, NULL);
+ VkQueueFamilyProperties *qfs = pl_calloc_ptr(tmp, qfnum, qfs);
+ vk->GetPhysicalDeviceQueueFamilyProperties(vk->physd, &qfnum, qfs);
+ init_queue_locks(vk, qfnum, qfs);
+
+ PL_DEBUG(vk, "Queue families supported by device:");
+ for (int i = 0; i < qfnum; i++) {
+ PL_DEBUG(vk, " %d: flags 0x%"PRIx32" num %"PRIu32, i,
+ qfs[i].queueFlags, qfs[i].queueCount);
+ }
+
+ VkQueueFlagBits gfx_flags = VK_QUEUE_GRAPHICS_BIT;
+ if (!params->async_compute)
+ gfx_flags |= VK_QUEUE_COMPUTE_BIT;
+
+ int idx_gfx = find_qf(qfs, qfnum, gfx_flags);
+ int idx_comp = find_qf(qfs, qfnum, VK_QUEUE_COMPUTE_BIT);
+ int idx_tf = find_qf(qfs, qfnum, VK_QUEUE_TRANSFER_BIT);
+ if (idx_tf < 0)
+ idx_tf = idx_comp;
+
+ if (!params->async_compute)
+ idx_comp = idx_gfx;
+ if (!params->async_transfer)
+ idx_tf = idx_gfx;
+
+ PL_DEBUG(vk, "Using graphics queue %d", idx_gfx);
+ if (idx_tf != idx_gfx)
+ PL_INFO(vk, "Using async transfer (queue %d)", idx_tf);
+ if (idx_comp != idx_gfx)
+ PL_INFO(vk, "Using async compute (queue %d)", idx_comp);
+
+ // Vulkan requires at least one GRAPHICS+COMPUTE queue, so if this fails
+ // something is horribly wrong.
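A minimal worked example of the find_qf() selection above, with a hypothetical queue family layout (illustrative only, not from the diff): suppose the driver exposes QF 0 = GRAPHICS|COMPUTE|TRANSFER with 16 queues, QF 1 = TRANSFER only with 2 queues, and QF 2 = COMPUTE|TRANSFER with 8 queues. The three find_qf() calls then return 0, 2 and 1 respectively (before the async_compute/async_transfer overrides), because ties on the requested flags are broken towards the family with the fewest masked capability bits, i.e. the most specialized family, and only then towards the family with more queues.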
+ pl_assert(idx_gfx >= 0 && idx_comp >= 0 && idx_tf >= 0);
+
+ // If needed, ensure we can actually present to the surface using this queue
+ if (params->surface) {
+ VkBool32 sup = false;
+ VK(vk->GetPhysicalDeviceSurfaceSupportKHR(vk->physd, idx_gfx,
+ params->surface, &sup));
+ if (!sup) {
+ PL_FATAL(vk, "Queue family does not support surface presentation!");
+ goto error;
+ }
+ }
+
+ // Enumerate all supported extensions
+ pl_clock_t start = pl_clock_now();
+ uint32_t num_exts_avail = 0;
+ VK(vk->EnumerateDeviceExtensionProperties(vk->physd, NULL, &num_exts_avail, NULL));
+ VkExtensionProperties *exts_avail = pl_calloc_ptr(tmp, num_exts_avail, exts_avail);
+ VK(vk->EnumerateDeviceExtensionProperties(vk->physd, NULL, &num_exts_avail, exts_avail));
+ pl_log_cpu_time(vk->log, start, pl_clock_now(), "enumerating device extensions");
+
+ PL_DEBUG(vk, "Available device extensions:");
+ for (int i = 0; i < num_exts_avail; i++)
+ PL_DEBUG(vk, " %s", exts_avail[i].extensionName);
+
+ // Add all extensions we need
+ if (params->surface)
+ PL_ARRAY_APPEND(vk->alloc, vk->exts, VK_KHR_SWAPCHAIN_EXTENSION_NAME);
+
+ // Keep track of all optional function pointers associated with extensions
+ PL_ARRAY(const struct vk_fun *) ext_funs = {0};
+
+ // Add all optional device-level extensions
+ for (int i = 0; i < PL_ARRAY_SIZE(vk_device_extensions); i++) {
+ const struct vk_ext *ext = &vk_device_extensions[i];
+ uint32_t core_ver = vk_ext_promoted_ver(ext->name);
+ if (core_ver && vk->api_ver >= core_ver) {
+ // Extension is already implicitly enabled by the API version
+ for (const struct vk_fun *f = ext->funs; f && f->name; f++)
+ PL_ARRAY_APPEND(tmp, ext_funs, f);
+ continue;
+ }
+
+ for (int n = 0; n < num_exts_avail; n++) {
+ if (strcmp(ext->name, exts_avail[n].extensionName) == 0) {
+ PL_ARRAY_APPEND(vk->alloc, vk->exts, ext->name);
+ for (const struct vk_fun *f = ext->funs; f && f->name; f++)
+ PL_ARRAY_APPEND(tmp, ext_funs, f);
+ break;
+ }
+ }
+ }
+
+ // Add extra user extensions
+ for (int i = 0; i < params->num_extensions; i++)
+ PL_ARRAY_APPEND(vk->alloc, vk->exts, params->extensions[i]);
+
+ // Add optional extra user extensions
+ for (int i = 0; i < params->num_opt_extensions; i++) {
+ const char *ext = params->opt_extensions[i];
+ for (int n = 0; n < num_exts_avail; n++) {
+ if (strcmp(ext, exts_avail[n].extensionName) == 0) {
+ PL_ARRAY_APPEND(vk->alloc, vk->exts, ext);
+ break;
+ }
+ }
+ }
+
+ VkPhysicalDeviceFeatures2 features = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2_KHR
+ };
+
+ vk_features_normalize(tmp, &pl_vulkan_required_features, vk->api_ver, &features);
+ vk_features_normalize(tmp, &pl_vulkan_recommended_features, vk->api_ver, &features);
+ vk_features_normalize(tmp, params->features, vk->api_ver, &features);
+
+ // Explicitly clear the features struct before querying feature support
+ // from the driver. This way, we don't mistakenly mark as supported
+ // features coming from structs the driver doesn't have support for.
+
+ VkPhysicalDeviceFeatures2 *features_sup = vk_chain_memdup(tmp, &features);; + for (VkBaseOutStructure *out = (void *) features_sup; out; out = out->pNext) { + const size_t size = vk_struct_size(out->sType); + memset(&out[1], 0, size - sizeof(out[0])); + } + + vk->GetPhysicalDeviceFeatures2KHR(vk->physd, features_sup); + + // Filter out unsupported features + for (VkBaseOutStructure *f = (VkBaseOutStructure *) &features; f; f = f->pNext) { + const VkBaseInStructure *sup = vk_find_struct(features_sup, f->sType); + VkBool32 *flags = (VkBool32 *) &f[1]; + const VkBool32 *flags_sup = (const VkBool32 *) &sup[1]; + const size_t size = vk_struct_size(f->sType) - sizeof(VkBaseOutStructure); + for (int i = 0; i < size / sizeof(VkBool32); i++) + flags[i] &= flags_sup[i]; + } + + // Construct normalized output chain + vk->features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2; + vk_features_normalize(vk->alloc, &features, 0, &vk->features); + if (!check_required_features(vk)) { + PL_FATAL(vk, "Vulkan device does not support all required features!"); + goto error; + } + + // Enable all queues at device creation time, to maximize compatibility + // with other API users (e.g. FFmpeg) + PL_ARRAY(VkDeviceQueueCreateInfo) qinfos = {0}; + for (int i = 0; i < qfnum; i++) { + bool use_qf = i == idx_gfx || i == idx_comp || i == idx_tf; + use_qf |= qfs[i].queueFlags & params->extra_queues; + if (!use_qf) + continue; + PL_ARRAY_APPEND(tmp, qinfos, (VkDeviceQueueCreateInfo) { + .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO, + .queueFamilyIndex = i, + .queueCount = qfs[i].queueCount, + .pQueuePriorities = pl_calloc(tmp, qfs[i].queueCount, sizeof(float)), + }); + } + + VkDeviceCreateInfo dinfo = { + .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO, + .pNext = &features, + .pQueueCreateInfos = qinfos.elem, + .queueCreateInfoCount = qinfos.num, + .ppEnabledExtensionNames = vk->exts.elem, + .enabledExtensionCount = vk->exts.num, + }; + + PL_INFO(vk, "Creating vulkan device%s", vk->exts.num ? 
" with extensions:" : ""); + for (int i = 0; i < vk->exts.num; i++) + PL_INFO(vk, " %s", vk->exts.elem[i]); + + start = pl_clock_now(); + VK(vk->CreateDevice(vk->physd, &dinfo, PL_VK_ALLOC, &vk->dev)); + pl_log_cpu_time(vk->log, start, pl_clock_now(), "creating vulkan device"); + + // Load all mandatory device-level functions + for (int i = 0; i < PL_ARRAY_SIZE(vk_dev_funs); i++) + load_vk_fun(vk, &vk_dev_funs[i]); + + // Load all of the optional functions from the extensions we enabled + for (int i = 0; i < ext_funs.num; i++) + load_vk_fun(vk, ext_funs.elem[i]); + + // Create the command pools for the queues we care about + const uint32_t qmax = PL_DEF(params->queue_count, UINT32_MAX); + for (int i = 0; i < qfnum; i++) { + if (i != idx_gfx && i != idx_tf && i != idx_comp) + continue; // ignore QFs not used internally + + int qnum = qfs[i].queueCount; + if (qmax < qnum) { + PL_DEBUG(vk, "Restricting QF %d from %d queues to %d", i, qnum, qmax); + qnum = qmax; + } + + struct vk_cmdpool *pool = vk_cmdpool_create(vk, i, qnum, qfs[i]); + if (!pool) + goto error; + PL_ARRAY_APPEND(vk->alloc, vk->pools, pool); + + // Update the pool_* pointers based on the corresponding index + const char *qf_name = NULL; + if (i == idx_tf) { + vk->pool_transfer = pool; + qf_name = "transfer"; + } + if (i == idx_comp) { + vk->pool_compute = pool; + qf_name = "compute"; + } + if (i == idx_gfx) { + vk->pool_graphics = pool; + qf_name = "graphics"; + } + + for (int n = 0; n < pool->num_queues; n++) + PL_VK_NAME_HANDLE(QUEUE, pool->queues[n], qf_name); + } + + pl_free(tmp); + return true; + +error: + PL_FATAL(vk, "Failed creating logical device!"); + pl_free(tmp); + vk->failed = true; + return false; +} + +static void lock_queue(pl_vulkan pl_vk, uint32_t qf, uint32_t qidx) +{ + struct vk_ctx *vk = PL_PRIV(pl_vk); + vk->lock_queue(vk->queue_ctx, qf, qidx); +} + +static void unlock_queue(pl_vulkan pl_vk, uint32_t qf, uint32_t qidx) +{ + struct vk_ctx *vk = PL_PRIV(pl_vk); + vk->unlock_queue(vk->queue_ctx, qf, qidx); +} + +static bool finalize_context(struct pl_vulkan_t *pl_vk, int max_glsl_version) +{ + struct vk_ctx *vk = PL_PRIV(pl_vk); + + pl_assert(vk->pool_graphics); + pl_assert(vk->pool_compute); + pl_assert(vk->pool_transfer); + + vk->ma = vk_malloc_create(vk); + if (!vk->ma) + return false; + + pl_vk->gpu = pl_gpu_create_vk(vk); + if (!pl_vk->gpu) + return false; + + // Blacklist / restrict features + if (max_glsl_version) { + struct pl_glsl_version *glsl = (struct pl_glsl_version *) &pl_vk->gpu->glsl; + glsl->version = PL_MIN(glsl->version, max_glsl_version); + glsl->version = PL_MAX(glsl->version, 140); // required for GL_KHR_vulkan_glsl + PL_INFO(vk, "Restricting GLSL version to %d... 
new version is %d", + max_glsl_version, glsl->version); + } + + // Expose the resulting vulkan objects + pl_vk->instance = vk->inst; + pl_vk->phys_device = vk->physd; + pl_vk->device = vk->dev; + pl_vk->get_proc_addr = vk->GetInstanceProcAddr; + pl_vk->api_version = vk->api_ver; + pl_vk->extensions = vk->exts.elem; + pl_vk->num_extensions = vk->exts.num; + pl_vk->features = &vk->features; + pl_vk->num_queues = vk->pools.num; + pl_vk->queues = pl_calloc_ptr(vk->alloc, vk->pools.num, pl_vk->queues); + pl_vk->lock_queue = lock_queue; + pl_vk->unlock_queue = unlock_queue; + + for (int i = 0; i < vk->pools.num; i++) { + struct pl_vulkan_queue *queues = (struct pl_vulkan_queue *) pl_vk->queues; + queues[i] = (struct pl_vulkan_queue) { + .index = vk->pools.elem[i]->qf, + .count = vk->pools.elem[i]->num_queues, + }; + + if (vk->pools.elem[i] == vk->pool_graphics) + pl_vk->queue_graphics = queues[i]; + if (vk->pools.elem[i] == vk->pool_compute) + pl_vk->queue_compute = queues[i]; + if (vk->pools.elem[i] == vk->pool_transfer) + pl_vk->queue_transfer = queues[i]; + } + + pl_assert(vk->lock_queue); + pl_assert(vk->unlock_queue); + return true; +} + +pl_vulkan pl_vulkan_create(pl_log log, const struct pl_vulkan_params *params) +{ + params = PL_DEF(params, &pl_vulkan_default_params); + struct pl_vulkan_t *pl_vk = pl_zalloc_obj(NULL, pl_vk, struct vk_ctx); + struct vk_ctx *vk = PL_PRIV(pl_vk); + *vk = (struct vk_ctx) { + .vulkan = pl_vk, + .alloc = pl_vk, + .log = log, + .inst = params->instance, + .GetInstanceProcAddr = get_proc_addr_fallback(log, params->get_proc_addr), + }; + + pl_mutex_init_type(&vk->lock, PL_MUTEX_RECURSIVE); + if (!vk->GetInstanceProcAddr) + goto error; + + if (!vk->inst) { + pl_assert(!params->surface); + pl_assert(!params->device); + PL_DEBUG(vk, "No VkInstance provided, creating one..."); + + // Mirror the instance params here to set `get_proc_addr` correctly + struct pl_vk_inst_params iparams; + iparams = *PL_DEF(params->instance_params, &pl_vk_inst_default_params); + iparams.get_proc_addr = params->get_proc_addr; + vk->internal_instance = pl_vk_inst_create(log, &iparams); + if (!vk->internal_instance) + goto error; + vk->inst = vk->internal_instance->instance; + } + + // Directly load all mandatory instance-level function pointers, since + // these will be required for all further device creation logic + for (int i = 0; i < PL_ARRAY_SIZE(vk_inst_funs); i++) + load_vk_fun(vk, &vk_inst_funs[i]); + + // Choose the physical device + if (params->device) { + PL_DEBUG(vk, "Using specified VkPhysicalDevice"); + vk->physd = params->device; + } else { + struct pl_vulkan_device_params dparams = { + .instance = vk->inst, + .get_proc_addr = params->get_proc_addr, + .surface = params->surface, + .device_name = params->device_name, + .allow_software = params->allow_software, + }; + memcpy(dparams.device_uuid, params->device_uuid, VK_UUID_SIZE); + + vk->physd = pl_vulkan_choose_device(log, &dparams); + if (!vk->physd) { + PL_FATAL(vk, "Found no suitable device, giving up."); + goto error; + } + } + + VkPhysicalDeviceIDPropertiesKHR id_props = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES_KHR, + }; + + VkPhysicalDeviceProperties2KHR prop = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR, + .pNext = &id_props, + }; + + vk->GetPhysicalDeviceProperties2(vk->physd, &prop); + vk->props = prop.properties; + + PL_INFO(vk, "Vulkan device properties:"); + PL_INFO(vk, " Device Name: %s", prop.properties.deviceName); + PL_INFO(vk, " Device ID: %"PRIx32":%"PRIx32, 
prop.properties.vendorID, + prop.properties.deviceID); + PL_INFO(vk, " Device UUID: %s", PRINT_UUID(id_props.deviceUUID)); + PL_INFO(vk, " Driver version: %"PRIx32, prop.properties.driverVersion); + PL_INFO(vk, " API version: %d.%d.%d", PRINTF_VER(prop.properties.apiVersion)); + + // Needed by device_init + vk->api_ver = prop.properties.apiVersion; + if (params->max_api_version) { + vk->api_ver = PL_MIN(vk->api_ver, params->max_api_version); + PL_INFO(vk, "Restricting API version to %d.%d.%d... new version %d.%d.%d", + PRINTF_VER(params->max_api_version), PRINTF_VER(vk->api_ver)); + } + + if (vk->api_ver < PL_VK_MIN_VERSION) { + PL_FATAL(vk, "Device API version %d.%d.%d is lower than the minimum " + "required version of %d.%d.%d, cannot proceed!", + PRINTF_VER(vk->api_ver), PRINTF_VER(PL_VK_MIN_VERSION)); + goto error; + } + + // Finally, initialize the logical device and the rest of the vk_ctx + if (!device_init(vk, params)) + goto error; + + if (!finalize_context(pl_vk, params->max_glsl_version)) + goto error; + + return pl_vk; + +error: + PL_FATAL(vk, "Failed initializing vulkan device"); + pl_vulkan_destroy((pl_vulkan *) &pl_vk); + return NULL; +} + +pl_vulkan pl_vulkan_import(pl_log log, const struct pl_vulkan_import_params *params) +{ + void *tmp = pl_tmp(NULL); + + struct pl_vulkan_t *pl_vk = pl_zalloc_obj(NULL, pl_vk, struct vk_ctx); + struct vk_ctx *vk = PL_PRIV(pl_vk); + *vk = (struct vk_ctx) { + .vulkan = pl_vk, + .alloc = pl_vk, + .log = log, + .imported = true, + .inst = params->instance, + .physd = params->phys_device, + .dev = params->device, + .GetInstanceProcAddr = get_proc_addr_fallback(log, params->get_proc_addr), + .lock_queue = params->lock_queue, + .unlock_queue = params->unlock_queue, + .queue_ctx = params->queue_ctx, + }; + + pl_mutex_init_type(&vk->lock, PL_MUTEX_RECURSIVE); + if (!vk->GetInstanceProcAddr) + goto error; + + for (int i = 0; i < PL_ARRAY_SIZE(vk_inst_funs); i++) + load_vk_fun(vk, &vk_inst_funs[i]); + + VkPhysicalDeviceIDPropertiesKHR id_props = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES_KHR, + }; + + VkPhysicalDeviceProperties2KHR prop = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR, + .pNext = &id_props, + }; + + pl_assert(vk->GetPhysicalDeviceProperties2); + vk->GetPhysicalDeviceProperties2(vk->physd, &prop); + vk->props = prop.properties; + + PL_INFO(vk, "Imported vulkan device properties:"); + PL_INFO(vk, " Device Name: %s", prop.properties.deviceName); + PL_INFO(vk, " Device ID: %"PRIx32":%"PRIx32, prop.properties.vendorID, + prop.properties.deviceID); + PL_INFO(vk, " Device UUID: %s", PRINT_UUID(id_props.deviceUUID)); + PL_INFO(vk, " Driver version: %"PRIx32, prop.properties.driverVersion); + PL_INFO(vk, " API version: %d.%d.%d", PRINTF_VER(prop.properties.apiVersion)); + + vk->api_ver = prop.properties.apiVersion; + if (params->max_api_version) { + vk->api_ver = PL_MIN(vk->api_ver, params->max_api_version); + PL_INFO(vk, "Restricting API version to %d.%d.%d... 
new version %d.%d.%d", + PRINTF_VER(params->max_api_version), PRINTF_VER(vk->api_ver)); + } + + if (vk->api_ver < PL_VK_MIN_VERSION) { + PL_FATAL(vk, "Device API version %d.%d.%d is lower than the minimum " + "required version of %d.%d.%d, cannot proceed!", + PRINTF_VER(vk->api_ver), PRINTF_VER(PL_VK_MIN_VERSION)); + goto error; + } + + vk->features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2; + vk_features_normalize(vk->alloc, params->features, 0, &vk->features); + if (!check_required_features(vk)) { + PL_FATAL(vk, "Imported Vulkan device was not created with all required " + "features!"); + goto error; + } + + // Load all mandatory device-level functions + for (int i = 0; i < PL_ARRAY_SIZE(vk_dev_funs); i++) + load_vk_fun(vk, &vk_dev_funs[i]); + + // Load all of the optional functions from the extensions enabled + for (int i = 0; i < PL_ARRAY_SIZE(vk_device_extensions); i++) { + const struct vk_ext *ext = &vk_device_extensions[i]; + uint32_t core_ver = vk_ext_promoted_ver(ext->name); + if (core_ver && vk->api_ver >= core_ver) { + for (const struct vk_fun *f = ext->funs; f && f->name; f++) + load_vk_fun(vk, f); + continue; + } + for (int n = 0; n < params->num_extensions; n++) { + if (strcmp(ext->name, params->extensions[n]) == 0) { + for (const struct vk_fun *f = ext->funs; f && f->name; f++) + load_vk_fun(vk, f); + break; + } + } + } + + uint32_t qfnum = 0; + vk->GetPhysicalDeviceQueueFamilyProperties(vk->physd, &qfnum, NULL); + VkQueueFamilyProperties *qfs = pl_calloc_ptr(tmp, qfnum, qfs); + vk->GetPhysicalDeviceQueueFamilyProperties(vk->physd, &qfnum, qfs); + if (!params->lock_queue) + init_queue_locks(vk, qfnum, qfs); + + // Create the command pools for each unique qf that exists + struct { + const struct pl_vulkan_queue *info; + struct vk_cmdpool **pool; + VkQueueFlagBits flags; // *any* of these flags provide the cap + } qinfos[] = { + { + .info = ¶ms->queue_graphics, + .pool = &vk->pool_graphics, + .flags = VK_QUEUE_GRAPHICS_BIT, + }, { + .info = ¶ms->queue_compute, + .pool = &vk->pool_compute, + .flags = VK_QUEUE_COMPUTE_BIT, + }, { + .info = ¶ms->queue_transfer, + .pool = &vk->pool_transfer, + .flags = VK_QUEUE_TRANSFER_BIT | + VK_QUEUE_GRAPHICS_BIT | + VK_QUEUE_COMPUTE_BIT, + } + }; + + for (int i = 0; i < PL_ARRAY_SIZE(qinfos); i++) { + int qf = qinfos[i].info->index; + struct vk_cmdpool **pool = qinfos[i].pool; + if (!qinfos[i].info->count) + continue; + + // API sanity check + pl_assert(qfs[qf].queueFlags & qinfos[i].flags); + + // See if we already created a pool for this queue family + for (int j = 0; j < i; j++) { + if (qinfos[j].info->count && qinfos[j].info->index == qf) { + *pool = *qinfos[j].pool; + goto next_qf; + } + } + + *pool = vk_cmdpool_create(vk, qf, qinfos[i].info->count, qfs[qf]); + if (!*pool) + goto error; + PL_ARRAY_APPEND(vk->alloc, vk->pools, *pool); + + // Pre-emptively set "lower priority" pools as well + for (int j = i+1; j < PL_ARRAY_SIZE(qinfos); j++) { + if (qfs[qf].queueFlags & qinfos[j].flags) + *qinfos[j].pool = *pool; + } + +next_qf: ; + } + + if (!vk->pool_graphics) { + PL_ERR(vk, "No valid queues provided?"); + goto error; + } + + if (!finalize_context(pl_vk, params->max_glsl_version)) + goto error; + + pl_free(tmp); + return pl_vk; + +error: + PL_FATAL(vk, "Failed importing vulkan device"); + pl_vulkan_destroy((pl_vulkan *) &pl_vk); + pl_free(tmp); + return NULL; +} diff --git a/src/vulkan/formats.c b/src/vulkan/formats.c new file mode 100644 index 0000000..f0eb0fb --- /dev/null +++ b/src/vulkan/formats.c @@ -0,0 +1,616 @@ +/* + * 
This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "formats.h" + +#define FMT(_name, num, size, ftype, bits, idx) \ + (struct pl_fmt_t) { \ + .name = _name, \ + .type = PL_FMT_##ftype, \ + .num_components = num, \ + .component_depth = bits, \ + .internal_size = size, \ + .opaque = false, \ + .texel_size = size, \ + .texel_align = size, \ + .host_bits = bits, \ + .sample_order = idx, \ + } + +#define IDX(...) {__VA_ARGS__} +#define BITS(...) {__VA_ARGS__} + +#define REGFMT(name, num, bits, type) \ + FMT(name, num, (num) * (bits) / 8, type, \ + BITS(bits, bits, bits, bits), \ + IDX(0, 1, 2, 3)) + +#define EMUFMT(_name, in, en, ib, eb, ftype) \ + (struct pl_fmt_t) { \ + .name = _name, \ + .type = PL_FMT_##ftype, \ + .num_components = en, \ + .component_depth = BITS(ib, ib, ib, ib),\ + .internal_size = (in) * (ib) / 8, \ + .opaque = false, \ + .emulated = true, \ + .texel_size = (en) * (eb) / 8, \ + .texel_align = (eb) / 8, \ + .host_bits = BITS(eb, eb, eb, eb),\ + .sample_order = IDX(0, 1, 2, 3), \ + } + +#define PACKED16FMT(_name, num, b) \ + (struct pl_fmt_t) { \ + .name = _name, \ + .type = PL_FMT_UNORM, \ + .num_components = num, \ + .component_depth = BITS(b, b, b, b), \ + .internal_size = (num) * 2, \ + .texel_size = (num) * 2, \ + .texel_align = (num) * 2, \ + .host_bits = BITS(16, 16, 16, 16),\ + .sample_order = IDX(0, 1, 2, 3), \ + } + +#define PLANARFMT(_name, planes, size, bits) \ + (struct pl_fmt_t) { \ + .name = _name, \ + .type = PL_FMT_UNORM, \ + .num_planes = planes, \ + .num_components = 3, \ + .component_depth = {bits, bits, bits}, \ + .internal_size = size, \ + .opaque = true, \ + } + +static const struct vk_format rgb8e = { + .tfmt = VK_FORMAT_R8G8B8A8_UNORM, + .bfmt = VK_FORMAT_R8G8B8_UNORM, + .icomps = 4, + .fmt = EMUFMT("rgb8", 4, 3, 8, 8, UNORM), +}; + +static const struct vk_format rgb16e = { + .tfmt = VK_FORMAT_R16G16B16A16_UNORM, + .bfmt = VK_FORMAT_R16G16B16_UNORM, + .icomps = 4, + .fmt = EMUFMT("rgb16", 4, 3, 16, 16, UNORM), +}; + +static const struct vk_format vk_formats[] = { + // Regular, byte-aligned integer formats + {VK_FORMAT_R8_UNORM, REGFMT("r8", 1, 8, UNORM)}, + {VK_FORMAT_R8G8_UNORM, REGFMT("rg8", 2, 8, UNORM)}, + {VK_FORMAT_R8G8B8_UNORM, REGFMT("rgb8", 3, 8, UNORM), .emufmt = &rgb8e}, + {VK_FORMAT_R8G8B8A8_UNORM, REGFMT("rgba8", 4, 8, UNORM)}, + {VK_FORMAT_R16_UNORM, REGFMT("r16", 1, 16, UNORM)}, + {VK_FORMAT_R16G16_UNORM, REGFMT("rg16", 2, 16, UNORM)}, + {VK_FORMAT_R16G16B16_UNORM, REGFMT("rgb16", 3, 16, UNORM), .emufmt = &rgb16e}, + {VK_FORMAT_R16G16B16A16_UNORM, REGFMT("rgba16", 4, 16, UNORM)}, + + {VK_FORMAT_R8_SNORM, REGFMT("r8s", 1, 8, SNORM)}, + {VK_FORMAT_R8G8_SNORM, REGFMT("rg8s", 2, 8, SNORM)}, + {VK_FORMAT_R8G8B8_SNORM, REGFMT("rgb8s", 3, 8, SNORM)}, + {VK_FORMAT_R8G8B8A8_SNORM, REGFMT("rgba8s", 4, 8, SNORM)}, + {VK_FORMAT_R16_SNORM, REGFMT("r16s", 1, 16, SNORM)}, + 
{VK_FORMAT_R16G16_SNORM, REGFMT("rg16s", 2, 16, SNORM)}, + {VK_FORMAT_R16G16B16_SNORM, REGFMT("rgb16s", 3, 16, SNORM)}, + {VK_FORMAT_R16G16B16A16_SNORM, REGFMT("rgba16s", 4, 16, SNORM)}, + + // Float formats (native formats: hf = half float, df = double float) + {VK_FORMAT_R16_SFLOAT, REGFMT("r16hf", 1, 16, FLOAT)}, + {VK_FORMAT_R16G16_SFLOAT, REGFMT("rg16hf", 2, 16, FLOAT)}, + {VK_FORMAT_R16G16B16_SFLOAT, REGFMT("rgb16hf", 3, 16, FLOAT)}, + {VK_FORMAT_R16G16B16A16_SFLOAT, REGFMT("rgba16hf", 4, 16, FLOAT)}, + {VK_FORMAT_R32_SFLOAT, REGFMT("r32f", 1, 32, FLOAT)}, + {VK_FORMAT_R32G32_SFLOAT, REGFMT("rg32f", 2, 32, FLOAT)}, + {VK_FORMAT_R32G32B32_SFLOAT, REGFMT("rgb32f", 3, 32, FLOAT)}, + {VK_FORMAT_R32G32B32A32_SFLOAT, REGFMT("rgba32f", 4, 32, FLOAT)}, + + // Float formats (emulated upload/download) + {VK_FORMAT_R16_SFLOAT, EMUFMT("r16f", 1, 1, 16, 32, FLOAT)}, + {VK_FORMAT_R16G16_SFLOAT, EMUFMT("rg16f", 2, 2, 16, 32, FLOAT)}, + {VK_FORMAT_R16G16B16_SFLOAT, EMUFMT("rgb16f", 3, 3, 16, 32, FLOAT)}, + {VK_FORMAT_R16G16B16A16_SFLOAT, EMUFMT("rgba16f", 4, 4, 16, 32, FLOAT)}, + + // Integer-sampled formats + {VK_FORMAT_R8_UINT, REGFMT("r8u", 1, 8, UINT)}, + {VK_FORMAT_R8G8_UINT, REGFMT("rg8u", 2, 8, UINT)}, + {VK_FORMAT_R8G8B8_UINT, REGFMT("rgb8u", 3, 8, UINT)}, + {VK_FORMAT_R8G8B8A8_UINT, REGFMT("rgba8u", 4, 8, UINT)}, + {VK_FORMAT_R16_UINT, REGFMT("r16u", 1, 16, UINT)}, + {VK_FORMAT_R16G16_UINT, REGFMT("rg16u", 2, 16, UINT)}, + {VK_FORMAT_R16G16B16_UINT, REGFMT("rgb16u", 3, 16, UINT)}, + {VK_FORMAT_R16G16B16A16_UINT, REGFMT("rgba16u", 4, 16, UINT)}, + {VK_FORMAT_R32_UINT, REGFMT("r32u", 1, 32, UINT)}, + {VK_FORMAT_R32G32_UINT, REGFMT("rg32u", 2, 32, UINT)}, + {VK_FORMAT_R32G32B32_UINT, REGFMT("rgb32u", 3, 32, UINT)}, + {VK_FORMAT_R32G32B32A32_UINT, REGFMT("rgba32u", 4, 32, UINT)}, + + {VK_FORMAT_R8_SINT, REGFMT("r8i", 1, 8, SINT)}, + {VK_FORMAT_R8G8_SINT, REGFMT("rg8i", 2, 8, SINT)}, + {VK_FORMAT_R8G8B8_SINT, REGFMT("rgb8i", 3, 8, SINT)}, + {VK_FORMAT_R8G8B8A8_SINT, REGFMT("rgba8i", 4, 8, SINT)}, + {VK_FORMAT_R16_SINT, REGFMT("r16i", 1, 16, SINT)}, + {VK_FORMAT_R16G16_SINT, REGFMT("rg16i", 2, 16, SINT)}, + {VK_FORMAT_R16G16B16_SINT, REGFMT("rgb16i", 3, 16, SINT)}, + {VK_FORMAT_R16G16B16A16_SINT, REGFMT("rgba16i", 4, 16, SINT)}, + {VK_FORMAT_R32_SINT, REGFMT("r32i", 1, 32, SINT)}, + {VK_FORMAT_R32G32_SINT, REGFMT("rg32i", 2, 32, SINT)}, + {VK_FORMAT_R32G32B32_SINT, REGFMT("rgb32i", 3, 32, SINT)}, + {VK_FORMAT_R32G32B32A32_SINT, REGFMT("rgba32i", 4, 32, SINT)}, + + // "Swapped" component order formats + {VK_FORMAT_B8G8R8_UNORM, FMT("bgr8", 3, 3, UNORM, BITS(8, 8, 8), IDX(2, 1, 0))}, + {VK_FORMAT_B8G8R8A8_UNORM, FMT("bgra8", 4, 4, UNORM, BITS(8, 8, 8, 8), IDX(2, 1, 0, 3))}, + + {VK_FORMAT_B8G8R8_UINT, FMT("bgr8u", 3, 3, UINT, BITS(8, 8, 8), IDX(2, 1, 0))}, + {VK_FORMAT_B8G8R8A8_UINT, FMT("bgra8u", 4, 4, UINT, BITS(8, 8, 8, 8), IDX(2, 1, 0, 3))}, + + {VK_FORMAT_B8G8R8_SINT, FMT("bgr8i", 3, 3, SINT, BITS(8, 8, 8), IDX(2, 1, 0))}, + {VK_FORMAT_B8G8R8A8_SINT, FMT("bgra8i", 4, 4, SINT, BITS(8, 8, 8, 8), IDX(2, 1, 0, 3))}, + + // "Packed" integer formats + // + // Note: These have the component order reversed from what the vulkan name + // implies, because we order our IDX from LSB to MSB (consistent with the + // usual ordering from lowest byte to highest byte, on little endian + // platforms), but Vulkan names them from MSB to LSB. 
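+    //
+    // For example, VK_FORMAT_R5G6B5_UNORM_PACK16 below places R in the highest
+    // bits and B in the lowest, so read from LSB to MSB the components are
+    // B, G, R - hence the name "bgr565" and the sample order IDX(2, 1, 0).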
+ {VK_FORMAT_R4G4_UNORM_PACK8, FMT("gr4", 2, 1, UNORM, BITS(4, 4), IDX(1, 0))}, + {VK_FORMAT_B4G4R4A4_UNORM_PACK16, FMT("argb4", 4, 2, UNORM, BITS(4, 4, 4, 4), IDX(3, 0, 1, 2))}, + {VK_FORMAT_R4G4B4A4_UNORM_PACK16, FMT("abgr4", 4, 2, UNORM, BITS(4, 4, 4, 4), IDX(3, 2, 1, 0))}, + + {VK_FORMAT_R5G6B5_UNORM_PACK16, FMT("bgr565", 3, 2, UNORM, BITS(5, 6, 5), IDX(2, 1, 0))}, + {VK_FORMAT_B5G6R5_UNORM_PACK16, FMT("rgb565", 3, 2, UNORM, BITS(5, 6, 5), IDX(0, 1, 2))}, + + {VK_FORMAT_R5G5B5A1_UNORM_PACK16, FMT("a1bgr5", 4, 2, UNORM, BITS(1, 5, 5, 5), IDX(3, 2, 1, 0))}, + {VK_FORMAT_B5G5R5A1_UNORM_PACK16, FMT("a1rgb5", 4, 2, UNORM, BITS(1, 5, 5, 5), IDX(3, 0, 1, 2))}, + {VK_FORMAT_A1R5G5B5_UNORM_PACK16, FMT("bgr5a1", 4, 2, UNORM, BITS(5, 5, 5, 1), IDX(2, 1, 0, 3))}, + + {VK_FORMAT_A2B10G10R10_UNORM_PACK32, FMT("rgb10a2", 4, 4, UNORM, BITS(10, 10, 10, 2), IDX(0, 1, 2, 3))}, + {VK_FORMAT_A2R10G10B10_UNORM_PACK32, FMT("bgr10a2", 4, 4, UNORM, BITS(10, 10, 10, 2), IDX(2, 1, 0, 3))}, + {VK_FORMAT_A2B10G10R10_SNORM_PACK32, FMT("rgb10a2s", 4, 4, SNORM, BITS(10, 10, 10, 2), IDX(0, 1, 2, 3))}, + {VK_FORMAT_A2R10G10B10_SNORM_PACK32, FMT("bgr10a2s", 4, 4, SNORM, BITS(10, 10, 10, 2), IDX(2, 1, 0, 3))}, + {VK_FORMAT_A2B10G10R10_UINT_PACK32, FMT("rgb10a2u", 4, 4, UINT, BITS(10, 10, 10, 2), IDX(0, 1, 2, 3))}, + {VK_FORMAT_A2R10G10B10_UINT_PACK32, FMT("bgr10a2u", 4, 4, UINT, BITS(10, 10, 10, 2), IDX(2, 1, 0, 3))}, + {VK_FORMAT_A2B10G10R10_SINT_PACK32, FMT("rgb10a2i", 4, 4, SINT, BITS(10, 10, 10, 2), IDX(0, 1, 2, 3))}, + {VK_FORMAT_A2R10G10B10_SINT_PACK32, FMT("bgr10a2i", 4, 4, SINT, BITS(10, 10, 10, 2), IDX(2, 1, 0, 3))}, + + + // Packed 16 bit formats + {VK_FORMAT_R10X6_UNORM_PACK16, PACKED16FMT("rx10", 1, 10)}, + {VK_FORMAT_R10X6G10X6_UNORM_2PACK16, PACKED16FMT("rxgx10", 2, 10)}, + {VK_FORMAT_R12X4_UNORM_PACK16, PACKED16FMT("rx12", 1, 12)}, + {VK_FORMAT_R12X4G12X4_UNORM_2PACK16, PACKED16FMT("rxgx12", 2, 12)}, + + // FIXME: enabling these requires VK_EXT_rgba10x6_formats or equivalent + // {VK_FORMAT_R10X6G10X6B10X6A10X6_UNORM_4PACK16, PACKED16FMT("rxgxbxax10", 4, 10)}, + // {VK_FORMAT_R12X4G12X4B12X4A12X4_UNORM_4PACK16, PACKED16FMT("rxgxbxax12", 4, 12)}, + + // Planar formats + {VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM, PLANARFMT("g8_b8_r8_420", 3, 12, 8), + .pfmt = { + {VK_FORMAT_R8_UNORM}, + {VK_FORMAT_R8_UNORM, .sx = 1, .sy = 1}, + {VK_FORMAT_R8_UNORM, .sx = 1, .sy = 1}, + }, + }, + {VK_FORMAT_G8_B8_R8_3PLANE_422_UNORM, PLANARFMT("g8_b8_r8_422", 3, 16, 8), + .pfmt = { + {VK_FORMAT_R8_UNORM}, + {VK_FORMAT_R8_UNORM, .sx = 1}, + {VK_FORMAT_R8_UNORM, .sx = 1}, + }, + }, + {VK_FORMAT_G8_B8_R8_3PLANE_444_UNORM, PLANARFMT("g8_b8_r8_444", 3, 24, 8), + .pfmt = { + {VK_FORMAT_R8_UNORM}, + {VK_FORMAT_R8_UNORM}, + {VK_FORMAT_R8_UNORM}, + }, + }, + + {VK_FORMAT_G16_B16_R16_3PLANE_420_UNORM, PLANARFMT("g16_b16_r16_420", 3, 24, 16), + .pfmt = { + {VK_FORMAT_R16_UNORM}, + {VK_FORMAT_R16_UNORM, .sx = 1, .sy = 1}, + {VK_FORMAT_R16_UNORM, .sx = 1, .sy = 1}, + }, + }, + {VK_FORMAT_G16_B16_R16_3PLANE_422_UNORM, PLANARFMT("g16_b16_r16_422", 3, 32, 16), + .pfmt = { + {VK_FORMAT_R16_UNORM}, + {VK_FORMAT_R16_UNORM, .sx = 1}, + {VK_FORMAT_R16_UNORM, .sx = 1}, + }, + }, + {VK_FORMAT_G16_B16_R16_3PLANE_444_UNORM, PLANARFMT("g16_b16_r16_444", 3, 48, 16), + .pfmt = { + {VK_FORMAT_R16_UNORM}, + {VK_FORMAT_R16_UNORM}, + {VK_FORMAT_R16_UNORM}, + }, + }, + + {VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_420_UNORM_3PACK16, PLANARFMT("gx10_bx10_rx10_420", 3, 24, 10), + .pfmt = { + {VK_FORMAT_R10X6_UNORM_PACK16}, + {VK_FORMAT_R10X6_UNORM_PACK16, .sx = 1, .sy = 
1}, + {VK_FORMAT_R10X6_UNORM_PACK16, .sx = 1, .sy = 1}, + }, + }, + {VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_422_UNORM_3PACK16, PLANARFMT("gx10_bx10_rx10_422", 3, 32, 10), + .pfmt = { + {VK_FORMAT_R10X6_UNORM_PACK16}, + {VK_FORMAT_R10X6_UNORM_PACK16, .sx = 1}, + {VK_FORMAT_R10X6_UNORM_PACK16, .sx = 1}, + }, + }, + {VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_444_UNORM_3PACK16, PLANARFMT("gx10_bx10_rx10_444", 3, 48, 10), + .pfmt = { + {VK_FORMAT_R10X6_UNORM_PACK16}, + {VK_FORMAT_R10X6_UNORM_PACK16}, + {VK_FORMAT_R10X6_UNORM_PACK16}, + }, + }, + + {VK_FORMAT_G12X4_B12X4_R12X4_3PLANE_420_UNORM_3PACK16, PLANARFMT("gx12_bx12_rx12_420", 3, 24, 12), + .pfmt = { + {VK_FORMAT_R12X4_UNORM_PACK16}, + {VK_FORMAT_R12X4_UNORM_PACK16, .sx = 1, .sy = 1}, + {VK_FORMAT_R12X4_UNORM_PACK16, .sx = 1, .sy = 1}, + }, + }, + {VK_FORMAT_G12X4_B12X4_R12X4_3PLANE_422_UNORM_3PACK16, PLANARFMT("gx12_bx12_rx12_422", 3, 32, 12), + .pfmt = { + {VK_FORMAT_R12X4_UNORM_PACK16}, + {VK_FORMAT_R12X4_UNORM_PACK16, .sx = 1}, + {VK_FORMAT_R12X4_UNORM_PACK16, .sx = 1}, + }, + }, + {VK_FORMAT_G12X4_B12X4_R12X4_3PLANE_444_UNORM_3PACK16, PLANARFMT("gx12_bx12_rx12_444", 3, 48, 12), + .pfmt = { + {VK_FORMAT_R12X4_UNORM_PACK16}, + {VK_FORMAT_R12X4_UNORM_PACK16}, + {VK_FORMAT_R12X4_UNORM_PACK16}, + }, + }, + + {VK_FORMAT_G8_B8R8_2PLANE_420_UNORM, PLANARFMT("g8_br8_420", 2, 12, 8), + .pfmt = { + {VK_FORMAT_R8_UNORM}, + {VK_FORMAT_R8G8_UNORM, .sx = 1, .sy = 1}, + }, + }, + {VK_FORMAT_G8_B8R8_2PLANE_422_UNORM, PLANARFMT("g8_br8_422", 2, 16, 8), + .pfmt = { + {VK_FORMAT_R8_UNORM}, + {VK_FORMAT_R8G8_UNORM, .sx = 1}, + }, + }, + {VK_FORMAT_G8_B8R8_2PLANE_444_UNORM, PLANARFMT("g8_br8_444", 2, 24, 8), + .min_ver = VK_API_VERSION_1_3, + .pfmt = { + {VK_FORMAT_R8_UNORM}, + {VK_FORMAT_R8G8_UNORM}, + }, + }, + + {VK_FORMAT_G16_B16R16_2PLANE_420_UNORM, PLANARFMT("g16_br16_420", 2, 24, 16), + .pfmt = { + {VK_FORMAT_R16_UNORM}, + {VK_FORMAT_R16G16_UNORM, .sx = 1, .sy = 1}, + }, + }, + {VK_FORMAT_G16_B16R16_2PLANE_422_UNORM, PLANARFMT("g16_br16_422", 2, 32, 16), + .pfmt = { + {VK_FORMAT_R16_UNORM}, + {VK_FORMAT_R16G16_UNORM, .sx = 1}, + }, + }, + {VK_FORMAT_G16_B16R16_2PLANE_444_UNORM, PLANARFMT("g16_br16_444", 2, 48, 16), + .min_ver = VK_API_VERSION_1_3, + .pfmt = { + {VK_FORMAT_R16_UNORM}, + {VK_FORMAT_R16G16_UNORM}, + }, + }, + + {VK_FORMAT_G10X6_B10X6R10X6_2PLANE_420_UNORM_3PACK16, PLANARFMT("gx10_bxrx10_420", 2, 24, 10), + .pfmt = { + {VK_FORMAT_R10X6_UNORM_PACK16}, + {VK_FORMAT_R10X6G10X6_UNORM_2PACK16, .sx = 1, .sy = 1}, + }, + }, + {VK_FORMAT_G10X6_B10X6R10X6_2PLANE_422_UNORM_3PACK16, PLANARFMT("gx10_bxrx10_422", 2, 32, 10), + .pfmt = { + {VK_FORMAT_R10X6_UNORM_PACK16}, + {VK_FORMAT_R10X6G10X6_UNORM_2PACK16, .sx = 1}, + }, + }, + {VK_FORMAT_G10X6_B10X6R10X6_2PLANE_444_UNORM_3PACK16, PLANARFMT("gx10_bxrx10_444", 2, 48, 10), + .min_ver = VK_API_VERSION_1_3, + .pfmt = { + {VK_FORMAT_R10X6_UNORM_PACK16}, + {VK_FORMAT_R10X6G10X6_UNORM_2PACK16}, + }, + }, + + {VK_FORMAT_G12X4_B12X4R12X4_2PLANE_420_UNORM_3PACK16, PLANARFMT("gx12_bxrx12_420", 2, 24, 12), + .pfmt = { + {VK_FORMAT_R12X4_UNORM_PACK16}, + {VK_FORMAT_R12X4G12X4_UNORM_2PACK16, .sx = 1, .sy = 1}, + }, + }, + {VK_FORMAT_G12X4_B12X4R12X4_2PLANE_422_UNORM_3PACK16, PLANARFMT("gx12_bxrx12_422", 2, 32, 12), + .pfmt = { + {VK_FORMAT_R12X4_UNORM_PACK16}, + {VK_FORMAT_R12X4G12X4_UNORM_2PACK16, .sx = 1}, + }, + }, + {VK_FORMAT_G12X4_B12X4R12X4_2PLANE_444_UNORM_3PACK16, PLANARFMT("gx12_bxrx12_444", 2, 48, 12), + .min_ver = VK_API_VERSION_1_3, + .pfmt = { + {VK_FORMAT_R12X4_UNORM_PACK16}, + 
{VK_FORMAT_R12X4G12X4_UNORM_2PACK16}, + }, + }, + + {0} +}; + +#undef BITS +#undef IDX +#undef REGFMT +#undef FMT + +void vk_setup_formats(struct pl_gpu_t *gpu) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + PL_ARRAY(pl_fmt) formats = {0}; + + // Texture format emulation requires at least support for texel buffers + bool has_emu = gpu->glsl.compute && gpu->limits.max_buffer_texels; + + for (const struct vk_format *pvk_fmt = vk_formats; pvk_fmt->tfmt; pvk_fmt++) { + const struct vk_format *vk_fmt = pvk_fmt; + + // Skip formats that require a too new version of Vulkan + if (vk_fmt->min_ver > vk->api_ver) + continue; + + // Skip formats with innately emulated representation if unsupported + if (vk_fmt->fmt.emulated && !has_emu) + continue; + + // Suppress some errors/warnings spit out by the format probing code + pl_log_level_cap(vk->log, PL_LOG_INFO); + + bool has_drm_mods = vk->GetImageDrmFormatModifierPropertiesEXT; + VkDrmFormatModifierPropertiesEXT modifiers[16] = {0}; + VkDrmFormatModifierPropertiesListEXT drm_props = { + .sType = VK_STRUCTURE_TYPE_DRM_FORMAT_MODIFIER_PROPERTIES_LIST_EXT, + .drmFormatModifierCount = PL_ARRAY_SIZE(modifiers), + .pDrmFormatModifierProperties = modifiers, + }; + + VkFormatProperties2KHR prop2 = { + .sType = VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_2, + .pNext = has_drm_mods ? &drm_props : NULL, + }; + + vk->GetPhysicalDeviceFormatProperties2KHR(vk->physd, vk_fmt->tfmt, &prop2); + + // If wholly unsupported, try falling back to the emulation formats + // for texture operations + VkFormatProperties *prop = &prop2.formatProperties; + while (has_emu && !prop->optimalTilingFeatures && vk_fmt->emufmt) { + vk_fmt = vk_fmt->emufmt; + vk->GetPhysicalDeviceFormatProperties2KHR(vk->physd, vk_fmt->tfmt, &prop2); + } + + VkFormatFeatureFlags texflags = prop->optimalTilingFeatures; + VkFormatFeatureFlags bufflags = prop->bufferFeatures; + if (vk_fmt->fmt.emulated) { + // Emulated formats might have a different buffer representation + // than their texture representation. If they don't, assume their + // buffer representation is nonsensical (e.g. r16f) + if (vk_fmt->bfmt) { + vk->GetPhysicalDeviceFormatProperties(vk->physd, vk_fmt->bfmt, prop); + bufflags = prop->bufferFeatures; + } else { + bufflags = 0; + } + } else if (vk_fmt->fmt.num_planes) { + // Planar textures cannot be used directly + texflags = bufflags = 0; + } + + pl_log_level_cap(vk->log, PL_LOG_NONE); + + struct pl_fmt_t *fmt = pl_alloc_obj(gpu, fmt, struct pl_fmt_vk); + struct pl_fmt_vk *fmtp = PL_PRIV(fmt); + *fmt = vk_fmt->fmt; + *fmtp = (struct pl_fmt_vk) { + .vk_fmt = vk_fmt + }; + + // Always set the signature to the actual texture format, so we can use + // it to guarantee renderpass compatibility. 
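+        // (The sub-plane format lookup further down in this loop also relies
+        // on this, by matching each plane's VkFormat against these signatures.)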
+ fmt->signature = (uint64_t) vk_fmt->tfmt; + + // For sanity, clear the superfluous fields + for (int i = fmt->num_components; i < 4; i++) { + fmt->component_depth[i] = 0; + fmt->sample_order[i] = 0; + fmt->host_bits[i] = 0; + } + + // We can set this universally + fmt->fourcc = pl_fmt_fourcc(fmt); + + if (has_drm_mods) { + + if (drm_props.drmFormatModifierCount == PL_ARRAY_SIZE(modifiers)) { + PL_WARN(gpu, "DRM modifier list for format %s possibly truncated", + fmt->name); + } + + // Query the list of supported DRM modifiers from the driver + PL_ARRAY(uint64_t) modlist = {0}; + for (int i = 0; i < drm_props.drmFormatModifierCount; i++) { + if (modifiers[i].drmFormatModifierPlaneCount > 1) { + PL_TRACE(gpu, "Ignoring format modifier %s of " + "format %s because its plane count %d > 1", + PRINT_DRM_MOD(modifiers[i].drmFormatModifier), + fmt->name, modifiers[i].drmFormatModifierPlaneCount); + continue; + } + + // Only warn about texture format features relevant to us + const VkFormatFeatureFlags flag_mask = + VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT | + VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT | + VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT | + VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT | + VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT | + VK_FORMAT_FEATURE_BLIT_SRC_BIT | + VK_FORMAT_FEATURE_BLIT_DST_BIT; + + + VkFormatFeatureFlags flags = modifiers[i].drmFormatModifierTilingFeatures; + if ((flags & flag_mask) != (texflags & flag_mask)) { + PL_DEBUG(gpu, "DRM format modifier %s of format %s " + "supports fewer caps (0x%"PRIx32") than optimal tiling " + "(0x%"PRIx32"), may result in limited capability!", + PRINT_DRM_MOD(modifiers[i].drmFormatModifier), + fmt->name, flags, texflags); + } + + PL_ARRAY_APPEND(fmt, modlist, modifiers[i].drmFormatModifier); + } + + fmt->num_modifiers = modlist.num; + fmt->modifiers = modlist.elem; + + } else if (gpu->export_caps.tex & PL_HANDLE_DMA_BUF) { + + // Hard-code a list of static mods that we're likely to support + static const uint64_t static_mods[2] = { + DRM_FORMAT_MOD_INVALID, + DRM_FORMAT_MOD_LINEAR, + }; + + fmt->num_modifiers = PL_ARRAY_SIZE(static_mods); + fmt->modifiers = static_mods; + + } + + struct { VkFormatFeatureFlags flags; enum pl_fmt_caps caps; } bufbits[] = { + {VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT, PL_FMT_CAP_VERTEX}, + {VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT, PL_FMT_CAP_TEXEL_UNIFORM}, + {VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_BIT, PL_FMT_CAP_TEXEL_STORAGE}, + }; + + for (int i = 0; i < PL_ARRAY_SIZE(bufbits); i++) { + if ((bufflags & bufbits[i].flags) == bufbits[i].flags) + fmt->caps |= bufbits[i].caps; + } + + if (fmt->caps) { + fmt->glsl_type = pl_var_glsl_type_name(pl_var_from_fmt(fmt, "")); + pl_assert(fmt->glsl_type); + } + + struct { VkFormatFeatureFlags flags; enum pl_fmt_caps caps; } bits[] = { + {VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT, PL_FMT_CAP_BLENDABLE}, + {VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT, PL_FMT_CAP_LINEAR}, + {VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT, PL_FMT_CAP_SAMPLEABLE}, + {VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT, PL_FMT_CAP_STORABLE}, + {VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT, PL_FMT_CAP_RENDERABLE}, + + // We don't distinguish between the two blit modes for pl_fmt_caps + {VK_FORMAT_FEATURE_BLIT_SRC_BIT | VK_FORMAT_FEATURE_BLIT_DST_BIT, + PL_FMT_CAP_BLITTABLE}, + }; + + for (int i = 0; i < PL_ARRAY_SIZE(bits); i++) { + if ((texflags & bits[i].flags) == bits[i].flags) + fmt->caps |= bits[i].caps; + } + + // For blit emulation via compute shaders + if (!(fmt->caps & PL_FMT_CAP_BLITTABLE) && (fmt->caps & 
PL_FMT_CAP_STORABLE)) { + fmt->caps |= PL_FMT_CAP_BLITTABLE; + fmtp->blit_emulated = true; + } + + // This is technically supported for all textures, but the semantics + // of pl_gpu require it only be listed for non-opaque ones + if (!fmt->opaque) + fmt->caps |= PL_FMT_CAP_HOST_READABLE; + + // Vulkan requires a minimum GLSL version that supports textureGather() + if (fmt->caps & PL_FMT_CAP_SAMPLEABLE) + fmt->gatherable = true; + + // Disable implied capabilities where the dependencies are unavailable + enum pl_fmt_caps storable = PL_FMT_CAP_STORABLE | PL_FMT_CAP_TEXEL_STORAGE; + if (!(fmt->caps & PL_FMT_CAP_SAMPLEABLE)) + fmt->caps &= ~PL_FMT_CAP_LINEAR; + if (!gpu->glsl.compute) + fmt->caps &= ~storable; + + bool has_nofmt = vk->features.features.shaderStorageImageReadWithoutFormat && + vk->features.features.shaderStorageImageWriteWithoutFormat; + + if (fmt->caps & storable) { + int real_comps = PL_DEF(vk_fmt->icomps, fmt->num_components); + fmt->glsl_format = pl_fmt_glsl_format(fmt, real_comps); + if (!fmt->glsl_format && !has_nofmt) { + PL_DEBUG(gpu, "Storable format '%s' has no matching GLSL " + "format qualifier but read/write without format " + "is not supported.. disabling", fmt->name); + fmt->caps &= ~storable; + } + } + + if (fmt->caps & storable) + fmt->caps |= PL_FMT_CAP_READWRITE; + + // Pick sub-plane formats for planar formats + for (int n = 0; n < fmt->num_planes; n++) { + for (int i = 0; i < formats.num; i++) { + if (formats.elem[i]->signature == vk_fmt->pfmt[n].fmt) { + fmt->planes[n].format = formats.elem[i]; + fmt->planes[n].shift_x = vk_fmt->pfmt[n].sx; + fmt->planes[n].shift_y = vk_fmt->pfmt[n].sy; + break; + } + } + + pl_assert(fmt->planes[n].format); + } + + PL_ARRAY_APPEND(gpu, formats, fmt); + } + + gpu->formats = formats.elem; + gpu->num_formats = formats.num; +} diff --git a/src/vulkan/formats.h b/src/vulkan/formats.h new file mode 100644 index 0000000..b1408fd --- /dev/null +++ b/src/vulkan/formats.h @@ -0,0 +1,34 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#pragma once + +#include "common.h" +#include "gpu.h" + +struct vk_format { + VkFormat tfmt; // internal vulkan format enum (textures) + struct pl_fmt_t fmt;// pl_fmt template (features will be auto-detected) + int icomps; // internal component count (or 0 to infer from `fmt`) + VkFormat bfmt; // vulkan format for use as buffers (or 0 to use `tfmt`) + const struct vk_format *emufmt; // alternate format for emulation + uint32_t min_ver; // minimum vulkan API version for this format to exist + struct { VkFormat fmt; int sx, sy; } pfmt[4]; // plane formats (for planar textures) +}; + +// Add all supported formats to the `pl_gpu` format list +void vk_setup_formats(struct pl_gpu_t *gpu); diff --git a/src/vulkan/gpu.c b/src/vulkan/gpu.c new file mode 100644 index 0000000..69aca67 --- /dev/null +++ b/src/vulkan/gpu.c @@ -0,0 +1,924 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "gpu.h" +#include "formats.h" +#include "glsl/spirv.h" + +#ifdef PL_HAVE_UNIX +#include <unistd.h> +#endif + +// Gives us enough queries for 8 results +#define QUERY_POOL_SIZE 16 + +struct pl_timer_t { + VkQueryPool qpool; // even=start, odd=stop + int index_write; // next index to write to + int index_read; // next index to read from + uint64_t pending; // bitmask of queries that are still running +}; + +static inline uint64_t timer_bit(int index) +{ + return 1llu << (index / 2); +} + +static void timer_destroy_cb(pl_gpu gpu, pl_timer timer) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + + pl_assert(!timer->pending); + vk->DestroyQueryPool(vk->dev, timer->qpool, PL_VK_ALLOC); + pl_free(timer); +} + +static pl_timer vk_timer_create(pl_gpu gpu) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + + pl_timer timer = pl_alloc_ptr(NULL, timer); + *timer = (struct pl_timer_t) {0}; + + struct VkQueryPoolCreateInfo qinfo = { + .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, + .queryType = VK_QUERY_TYPE_TIMESTAMP, + .queryCount = QUERY_POOL_SIZE, + }; + + VK(vk->CreateQueryPool(vk->dev, &qinfo, PL_VK_ALLOC, &timer->qpool)); + return timer; + +error: + timer_destroy_cb(gpu, timer); + return NULL; +} + +static void vk_timer_destroy(pl_gpu gpu, pl_timer timer) +{ + vk_gpu_idle_callback(gpu, (vk_cb) timer_destroy_cb, gpu, timer); +} + +static uint64_t vk_timer_query(pl_gpu gpu, pl_timer timer) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + + if (timer->index_read == timer->index_write) + return 0; // no more unprocessed results + + vk_poll_commands(vk, 0); + if (timer->pending & timer_bit(timer->index_read)) + return 0; // still waiting for results + + VkResult res; + uint64_t ts[2] = {0}; + res = vk->GetQueryPoolResults(vk->dev, timer->qpool, timer->index_read, 2, + sizeof(ts), &ts[0], sizeof(uint64_t), + VK_QUERY_RESULT_64_BIT); + + switch (res) { + case VK_SUCCESS: + timer->index_read = 
(timer->index_read + 2) % QUERY_POOL_SIZE; + return (ts[1] - ts[0]) * vk->props.limits.timestampPeriod; + case VK_NOT_READY: + return 0; + default: + PL_VK_ASSERT(res, "Retrieving query pool results"); + } + +error: + return 0; +} + +static void timer_begin(pl_gpu gpu, struct vk_cmd *cmd, pl_timer timer) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + + if (!timer) + return; + + if (!cmd->pool->props.timestampValidBits) { + PL_TRACE(gpu, "QF %d does not support timestamp queries", cmd->pool->qf); + return; + } + + vk_poll_commands(vk, 0); + if (timer->pending & timer_bit(timer->index_write)) + return; // next query is still running, skip this timer + + VkQueueFlags reset_flags = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT; + if (cmd->pool->props.queueFlags & reset_flags) { + // Use direct command buffer resets + vk->CmdResetQueryPool(cmd->buf, timer->qpool, timer->index_write, 2); + } else { + // Use host query reset + vk->ResetQueryPool(vk->dev, timer->qpool, timer->index_write, 2); + } + + vk->CmdWriteTimestamp(cmd->buf, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + timer->qpool, timer->index_write); + + p->cmd_timer = timer; +} + +static inline bool supports_marks(struct vk_cmd *cmd) { + // Spec says debug markers are only available on graphics/compute queues + VkQueueFlags flags = cmd->pool->props.queueFlags; + return flags & (VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT); +} + +struct vk_cmd *_begin_cmd(pl_gpu gpu, enum queue_type type, const char *label, + pl_timer timer) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + pl_mutex_lock(&p->recording); + + struct vk_cmdpool *pool; + switch (type) { + case ANY: pool = p->cmd ? p->cmd->pool : vk->pool_graphics; break; + case GRAPHICS: pool = vk->pool_graphics; break; + case COMPUTE: pool = vk->pool_compute; break; + case TRANSFER: pool = vk->pool_transfer; break; + default: pl_unreachable(); + } + + if (!p->cmd || p->cmd->pool != pool) { + vk_cmd_submit(&p->cmd); + p->cmd = vk_cmd_begin(pool, label); + if (!p->cmd) { + pl_mutex_unlock(&p->recording); + return NULL; + } + } + + if (vk->CmdBeginDebugUtilsLabelEXT && supports_marks(p->cmd)) { + vk->CmdBeginDebugUtilsLabelEXT(p->cmd->buf, &(VkDebugUtilsLabelEXT) { + .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT, + .pLabelName = label, + }); + } + + timer_begin(gpu, p->cmd, timer); + return p->cmd; +} + +static void timer_end_cb(void *ptimer, void *pindex) +{ + pl_timer timer = ptimer; + int index = (uintptr_t) pindex; + timer->pending &= ~timer_bit(index); +} + +bool _end_cmd(pl_gpu gpu, struct vk_cmd **pcmd, bool submit) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + bool ret = true; + if (!pcmd) { + if (submit) { + pl_mutex_lock(&p->recording); + ret = vk_cmd_submit(&p->cmd); + pl_mutex_unlock(&p->recording); + } + return ret; + } + + struct vk_cmd *cmd = *pcmd; + pl_assert(p->cmd == cmd); + + if (p->cmd_timer) { + pl_timer timer = p->cmd_timer; + vk->CmdWriteTimestamp(cmd->buf, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + timer->qpool, timer->index_write + 1); + + timer->pending |= timer_bit(timer->index_write); + vk_cmd_callback(cmd, (vk_cb) timer_end_cb, timer, + (void *) (uintptr_t) timer->index_write); + + timer->index_write = (timer->index_write + 2) % QUERY_POOL_SIZE; + if (timer->index_write == timer->index_read) { + // forcibly drop the least recent result to make space + timer->index_read = (timer->index_read + 2) % QUERY_POOL_SIZE; + } + + p->cmd_timer = NULL; + } + + if (vk->CmdEndDebugUtilsLabelEXT && supports_marks(cmd)) + 
vk->CmdEndDebugUtilsLabelEXT(cmd->buf); + + if (submit) + ret = vk_cmd_submit(&p->cmd); + + pl_mutex_unlock(&p->recording); + return ret; +} + +void vk_gpu_idle_callback(pl_gpu gpu, vk_cb cb, const void *priv, const void *arg) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + + pl_mutex_lock(&p->recording); + if (p->cmd) { + vk_cmd_callback(p->cmd, cb, priv, arg); + } else { + vk_dev_callback(vk, cb, priv, arg); + } + pl_mutex_unlock(&p->recording); +} + +static void vk_gpu_destroy(pl_gpu gpu) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + + vk_cmd_submit(&p->cmd); + vk_wait_idle(vk); + + for (enum pl_tex_sample_mode s = 0; s < PL_TEX_SAMPLE_MODE_COUNT; s++) { + for (enum pl_tex_address_mode a = 0; a < PL_TEX_ADDRESS_MODE_COUNT; a++) + vk->DestroySampler(vk->dev, p->samplers[s][a], PL_VK_ALLOC); + } + + pl_spirv_destroy(&p->spirv); + pl_mutex_destroy(&p->recording); + pl_free((void *) gpu); +} + +pl_vulkan pl_vulkan_get(pl_gpu gpu) +{ + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + if (impl->destroy == vk_gpu_destroy) { + struct pl_vk *p = (struct pl_vk *) impl; + return p->vk->vulkan; + } + + return NULL; +} + +static pl_handle_caps vk_sync_handle_caps(struct vk_ctx *vk) +{ + pl_handle_caps caps = 0; + + for (int i = 0; vk_sync_handle_list[i]; i++) { + enum pl_handle_type type = vk_sync_handle_list[i]; + + VkPhysicalDeviceExternalSemaphoreInfo info = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_SEMAPHORE_INFO_KHR, + .handleType = vk_sync_handle_type(type), + }; + + VkExternalSemaphoreProperties props = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_SEMAPHORE_PROPERTIES_KHR, + }; + + vk->GetPhysicalDeviceExternalSemaphoreProperties(vk->physd, &info, &props); + VkExternalSemaphoreFeatureFlags flags = props.externalSemaphoreFeatures; + if ((props.compatibleHandleTypes & info.handleType) && + (flags & VK_EXTERNAL_SEMAPHORE_FEATURE_EXPORTABLE_BIT_KHR)) + { + caps |= type; + } + } + + return caps; +} + +static pl_handle_caps vk_tex_handle_caps(struct vk_ctx *vk, bool import) +{ + pl_handle_caps caps = 0; + + for (int i = 0; vk_mem_handle_list[i]; i++) { + enum pl_handle_type handle_type = vk_mem_handle_list[i]; + if (handle_type == PL_HANDLE_DMA_BUF && !vk->GetImageDrmFormatModifierPropertiesEXT) { + PL_DEBUG(vk, "Tex caps for %s (0x%x) unsupported: no DRM modifiers", + vk_handle_name(vk_mem_handle_type(PL_HANDLE_DMA_BUF)), + (unsigned int) PL_HANDLE_DMA_BUF); + continue; + } + + // Query whether creation of a "basic" dummy texture would work + VkPhysicalDeviceImageDrmFormatModifierInfoEXT drm_pinfo = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_DRM_FORMAT_MODIFIER_INFO_EXT, + .drmFormatModifier = DRM_FORMAT_MOD_LINEAR, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + }; + + VkPhysicalDeviceExternalImageFormatInfoKHR ext_pinfo = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO_KHR, + .handleType = vk_mem_handle_type(handle_type), + }; + + VkPhysicalDeviceImageFormatInfo2KHR pinfo = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2_KHR, + .pNext = &ext_pinfo, + .format = VK_FORMAT_R8_UNORM, + .type = VK_IMAGE_TYPE_2D, + .tiling = VK_IMAGE_TILING_OPTIMAL, + .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT, + }; + + if (handle_type == PL_HANDLE_DMA_BUF) { + vk_link_struct(&pinfo, &drm_pinfo); + pinfo.tiling = VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT; + } + + VkExternalImageFormatPropertiesKHR ext_props = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES_KHR, + }; + + VkImageFormatProperties2KHR props = 
{ + .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2_KHR, + .pNext = &ext_props, + }; + + VkResult res; + res = vk->GetPhysicalDeviceImageFormatProperties2KHR(vk->physd, &pinfo, &props); + if (res != VK_SUCCESS) { + PL_DEBUG(vk, "Tex caps for %s (0x%x) unsupported: %s", + vk_handle_name(ext_pinfo.handleType), + (unsigned int) handle_type, + vk_res_str(res)); + continue; + } + + if (vk_external_mem_check(vk, &ext_props.externalMemoryProperties, + handle_type, import)) + { + caps |= handle_type; + } + } + +#ifdef VK_EXT_metal_objects + if (vk->ExportMetalObjectsEXT && import) + caps |= PL_HANDLE_MTL_TEX | PL_HANDLE_IOSURFACE; +#endif + + return caps; +} + +static const VkFilter filters[PL_TEX_SAMPLE_MODE_COUNT] = { + [PL_TEX_SAMPLE_NEAREST] = VK_FILTER_NEAREST, + [PL_TEX_SAMPLE_LINEAR] = VK_FILTER_LINEAR, +}; + +static inline struct pl_spirv_version get_spirv_version(const struct vk_ctx *vk) +{ + if (vk->api_ver >= VK_API_VERSION_1_3) { + const VkPhysicalDeviceMaintenance4Features *device_maintenance4; + device_maintenance4 = vk_find_struct(&vk->features, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_4_FEATURES); + + if (device_maintenance4 && device_maintenance4->maintenance4) { + return (struct pl_spirv_version) { + .env_version = VK_API_VERSION_1_3, + .spv_version = PL_SPV_VERSION(1, 6), + }; + } + } + + pl_assert(vk->api_ver >= VK_API_VERSION_1_2); + return (struct pl_spirv_version) { + .env_version = VK_API_VERSION_1_2, + .spv_version = PL_SPV_VERSION(1, 5), + }; +} + +static const struct pl_gpu_fns pl_fns_vk; + +pl_gpu pl_gpu_create_vk(struct vk_ctx *vk) +{ + pl_assert(vk->dev); + + struct pl_gpu_t *gpu = pl_zalloc_obj(NULL, gpu, struct pl_vk); + gpu->log = vk->log; + + struct pl_vk *p = PL_PRIV(gpu); + pl_mutex_init(&p->recording); + p->vk = vk; + p->impl = pl_fns_vk; + p->spirv = pl_spirv_create(vk->log, get_spirv_version(vk)); + if (!p->spirv) + goto error; + + // Query all device properties + VkPhysicalDevicePCIBusInfoPropertiesEXT pci_props = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PCI_BUS_INFO_PROPERTIES_EXT, + }; + + VkPhysicalDeviceIDPropertiesKHR id_props = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES_KHR, + .pNext = &pci_props, + }; + + VkPhysicalDevicePushDescriptorPropertiesKHR pushd_props = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PUSH_DESCRIPTOR_PROPERTIES_KHR, + .pNext = &id_props, + }; + + VkPhysicalDeviceSubgroupProperties group_props = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES, + .pNext = &pushd_props, + }; + + VkPhysicalDeviceExternalMemoryHostPropertiesEXT host_props = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_MEMORY_HOST_PROPERTIES_EXT, + .pNext = &group_props, + }; + + VkPhysicalDeviceProperties2KHR props = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR, + .pNext = &host_props, + }; + + bool is_portability = false; + +#ifdef VK_KHR_portability_subset + VkPhysicalDevicePortabilitySubsetPropertiesKHR port_props = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PORTABILITY_SUBSET_PROPERTIES_KHR, + .minVertexInputBindingStrideAlignment = 1, + }; + + for (int i = 0; i < vk->exts.num; i++) { + if (!strcmp(vk->exts.elem[i], VK_KHR_PORTABILITY_SUBSET_EXTENSION_NAME)) { + vk_link_struct(&props, &port_props); + is_portability = true; + break; + } + } +#endif + + vk->GetPhysicalDeviceProperties2(vk->physd, &props); + VkPhysicalDeviceLimits limits = props.properties.limits; + + // Determine GLSL features and limits + gpu->glsl = (struct pl_glsl_version) { + .version = 450, + .vulkan = 
true, + .compute = true, + .max_shmem_size = limits.maxComputeSharedMemorySize, + .max_group_threads = limits.maxComputeWorkGroupInvocations, + .max_group_size = { + limits.maxComputeWorkGroupSize[0], + limits.maxComputeWorkGroupSize[1], + limits.maxComputeWorkGroupSize[2], + }, + }; + + VkShaderStageFlags req_stages = VK_SHADER_STAGE_FRAGMENT_BIT | + VK_SHADER_STAGE_COMPUTE_BIT; + VkSubgroupFeatureFlags req_flags = VK_SUBGROUP_FEATURE_BASIC_BIT | + VK_SUBGROUP_FEATURE_VOTE_BIT | + VK_SUBGROUP_FEATURE_ARITHMETIC_BIT | + VK_SUBGROUP_FEATURE_BALLOT_BIT | + VK_SUBGROUP_FEATURE_SHUFFLE_BIT; + + if ((group_props.supportedStages & req_stages) == req_stages && + (group_props.supportedOperations & req_flags) == req_flags) + { + gpu->glsl.subgroup_size = group_props.subgroupSize; + } + + if (vk->features.features.shaderImageGatherExtended) { + gpu->glsl.min_gather_offset = limits.minTexelGatherOffset; + gpu->glsl.max_gather_offset = limits.maxTexelGatherOffset; + } + + const size_t max_size = vk_malloc_avail(vk->ma, 0); + gpu->limits = (struct pl_gpu_limits) { + // pl_gpu + .thread_safe = true, + .callbacks = true, + // pl_buf + .max_buf_size = max_size, + .max_ubo_size = PL_MIN(limits.maxUniformBufferRange, max_size), + .max_ssbo_size = PL_MIN(limits.maxStorageBufferRange, max_size), + .max_vbo_size = vk_malloc_avail(vk->ma, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT), + .max_mapped_size = vk_malloc_avail(vk->ma, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT), + .max_buffer_texels = PL_MIN(limits.maxTexelBufferElements, max_size), + .align_host_ptr = host_props.minImportedHostPointerAlignment, + .host_cached = vk_malloc_avail(vk->ma, VK_MEMORY_PROPERTY_HOST_CACHED_BIT), + // pl_tex + .max_tex_1d_dim = limits.maxImageDimension1D, + .max_tex_2d_dim = limits.maxImageDimension2D, + .max_tex_3d_dim = limits.maxImageDimension3D, + .blittable_1d_3d = true, + .buf_transfer = true, + .align_tex_xfer_pitch = limits.optimalBufferCopyRowPitchAlignment, + .align_tex_xfer_offset = pl_lcm(limits.optimalBufferCopyOffsetAlignment, 4), + // pl_pass + .max_variable_comps = 0, // vulkan doesn't support these at all + .max_constants = SIZE_MAX, + .array_size_constants = !is_portability, + .max_pushc_size = limits.maxPushConstantsSize, +#ifdef VK_KHR_portability_subset + .align_vertex_stride = port_props.minVertexInputBindingStrideAlignment, +#else + .align_vertex_stride = 1, +#endif + .max_dispatch = { + limits.maxComputeWorkGroupCount[0], + limits.maxComputeWorkGroupCount[1], + limits.maxComputeWorkGroupCount[2], + }, + .fragment_queues = vk->pool_graphics->num_queues, + .compute_queues = vk->pool_compute->num_queues, + }; + + gpu->export_caps.buf = vk_malloc_handle_caps(vk->ma, false); + gpu->import_caps.buf = vk_malloc_handle_caps(vk->ma, true); + gpu->export_caps.tex = vk_tex_handle_caps(vk, false); + gpu->import_caps.tex = vk_tex_handle_caps(vk, true); + gpu->export_caps.sync = vk_sync_handle_caps(vk); + gpu->import_caps.sync = 0; // Not supported yet + + if (pl_gpu_supports_interop(gpu)) { + pl_static_assert(sizeof(gpu->uuid) == VK_UUID_SIZE); + memcpy(gpu->uuid, id_props.deviceUUID, sizeof(gpu->uuid)); + + gpu->pci.domain = pci_props.pciDomain; + gpu->pci.bus = pci_props.pciBus; + gpu->pci.device = pci_props.pciDevice; + gpu->pci.function = pci_props.pciFunction; + } + + if (vk->CmdPushDescriptorSetKHR) + p->max_push_descriptors = pushd_props.maxPushDescriptors; + + vk_setup_formats(gpu); + + // Compute the correct minimum texture alignment + p->min_texel_alignment = 1; + for (int i = 0; i < gpu->num_formats; i++) { + if 
(gpu->formats[i]->emulated || gpu->formats[i]->opaque) + continue; + size_t texel_size = gpu->formats[i]->texel_size; + p->min_texel_alignment = pl_lcm(p->min_texel_alignment, texel_size); + } + PL_DEBUG(gpu, "Minimum texel alignment: %zu", p->min_texel_alignment); + + // Initialize the samplers + for (enum pl_tex_sample_mode s = 0; s < PL_TEX_SAMPLE_MODE_COUNT; s++) { + for (enum pl_tex_address_mode a = 0; a < PL_TEX_ADDRESS_MODE_COUNT; a++) { + static const VkSamplerAddressMode modes[PL_TEX_ADDRESS_MODE_COUNT] = { + [PL_TEX_ADDRESS_CLAMP] = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE, + [PL_TEX_ADDRESS_REPEAT] = VK_SAMPLER_ADDRESS_MODE_REPEAT, + [PL_TEX_ADDRESS_MIRROR] = VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT, + }; + + VkSamplerCreateInfo sinfo = { + .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO, + .magFilter = filters[s], + .minFilter = filters[s], + .addressModeU = modes[a], + .addressModeV = modes[a], + .addressModeW = modes[a], + .maxAnisotropy = 1.0, + }; + + VK(vk->CreateSampler(vk->dev, &sinfo, PL_VK_ALLOC, &p->samplers[s][a])); + } + } + + return pl_gpu_finalize(gpu); + +error: + vk_gpu_destroy(gpu); + return NULL; +} + +static void vk_sync_destroy(pl_gpu gpu, pl_sync sync) +{ + if (!sync) + return; + + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + struct pl_sync_vk *sync_vk = PL_PRIV(sync); + +#ifdef PL_HAVE_UNIX + if (sync->handle_type == PL_HANDLE_FD) { + if (sync->wait_handle.fd > -1) + close(sync->wait_handle.fd); + if (sync->signal_handle.fd > -1) + close(sync->signal_handle.fd); + } +#endif +#ifdef PL_HAVE_WIN32 + if (sync->handle_type == PL_HANDLE_WIN32) { + if (sync->wait_handle.handle != NULL) + CloseHandle(sync->wait_handle.handle); + if (sync->signal_handle.handle != NULL) + CloseHandle(sync->signal_handle.handle); + } + // PL_HANDLE_WIN32_KMT is just an identifier. It doesn't get closed. 
+#endif + + vk->DestroySemaphore(vk->dev, sync_vk->wait, PL_VK_ALLOC); + vk->DestroySemaphore(vk->dev, sync_vk->signal, PL_VK_ALLOC); + + pl_free((void *) sync); +} + +void vk_sync_deref(pl_gpu gpu, pl_sync sync) +{ + if (!sync) + return; + + struct pl_sync_vk *sync_vk = PL_PRIV(sync); + if (pl_rc_deref(&sync_vk->rc)) + vk_sync_destroy(gpu, sync); +} + +static pl_sync vk_sync_create(pl_gpu gpu, enum pl_handle_type handle_type) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + + struct pl_sync_t *sync = pl_zalloc_obj(NULL, sync, struct pl_sync_vk); + sync->handle_type = handle_type; + + struct pl_sync_vk *sync_vk = PL_PRIV(sync); + pl_rc_init(&sync_vk->rc); + + VkExportSemaphoreCreateInfoKHR einfo = { + .sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO_KHR, + .handleTypes = vk_sync_handle_type(handle_type), + }; + + switch (handle_type) { + case PL_HANDLE_FD: + sync->wait_handle.fd = -1; + sync->signal_handle.fd = -1; + break; + case PL_HANDLE_WIN32: + case PL_HANDLE_WIN32_KMT: + sync->wait_handle.handle = NULL; + sync->signal_handle.handle = NULL; + break; + case PL_HANDLE_DMA_BUF: + case PL_HANDLE_HOST_PTR: + case PL_HANDLE_MTL_TEX: + case PL_HANDLE_IOSURFACE: + pl_unreachable(); + } + + const VkSemaphoreCreateInfo sinfo = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, + .pNext = &einfo, + }; + + VK(vk->CreateSemaphore(vk->dev, &sinfo, PL_VK_ALLOC, &sync_vk->wait)); + VK(vk->CreateSemaphore(vk->dev, &sinfo, PL_VK_ALLOC, &sync_vk->signal)); + PL_VK_NAME(SEMAPHORE, sync_vk->wait, "sync wait"); + PL_VK_NAME(SEMAPHORE, sync_vk->signal, "sync signal"); + +#ifdef PL_HAVE_UNIX + if (handle_type == PL_HANDLE_FD) { + VkSemaphoreGetFdInfoKHR finfo = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR, + .semaphore = sync_vk->wait, + .handleType = einfo.handleTypes, + }; + + VK(vk->GetSemaphoreFdKHR(vk->dev, &finfo, &sync->wait_handle.fd)); + + finfo.semaphore = sync_vk->signal; + VK(vk->GetSemaphoreFdKHR(vk->dev, &finfo, &sync->signal_handle.fd)); + } +#endif + +#ifdef PL_HAVE_WIN32 + if (handle_type == PL_HANDLE_WIN32 || + handle_type == PL_HANDLE_WIN32_KMT) + { + VkSemaphoreGetWin32HandleInfoKHR handle_info = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR, + .semaphore = sync_vk->wait, + .handleType = einfo.handleTypes, + }; + + VK(vk->GetSemaphoreWin32HandleKHR(vk->dev, &handle_info, + &sync->wait_handle.handle)); + + handle_info.semaphore = sync_vk->signal; + VK(vk->GetSemaphoreWin32HandleKHR(vk->dev, &handle_info, + &sync->signal_handle.handle)); + } +#endif + + return sync; + +error: + vk_sync_destroy(gpu, sync); + return NULL; +} + +void pl_vulkan_sem_destroy(pl_gpu gpu, VkSemaphore *semaphore) +{ + VkSemaphore sem = *semaphore; + if (!sem) + return; + + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + vk->DestroySemaphore(vk->dev, sem, PL_VK_ALLOC); + *semaphore = VK_NULL_HANDLE; +} + +VkSemaphore pl_vulkan_sem_create(pl_gpu gpu, const struct pl_vulkan_sem_params *params) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + + pl_assert(PL_ISPOT(params->export_handle)); + if ((params->export_handle & gpu->export_caps.sync) != params->export_handle) { + PL_ERR(gpu, "Invalid handle type 0x%"PRIx64" specified for " + "`pl_vulkan_sem_create`!", (uint64_t) params->export_handle); + return VK_NULL_HANDLE; + } + + switch (params->export_handle) { + case PL_HANDLE_FD: + params->out_handle->fd = -1; + break; + case PL_HANDLE_WIN32: + case PL_HANDLE_WIN32_KMT: + params->out_handle->handle = NULL; + break; + case 
PL_HANDLE_DMA_BUF:
+    case PL_HANDLE_HOST_PTR:
+    case PL_HANDLE_MTL_TEX:
+    case PL_HANDLE_IOSURFACE:
+        pl_unreachable();
+    }
+
+    const VkExportSemaphoreCreateInfoKHR einfo = {
+        .sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO_KHR,
+        .handleTypes = vk_sync_handle_type(params->export_handle),
+    };
+
+    const VkSemaphoreTypeCreateInfo stinfo = {
+        .sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO,
+        .pNext = params->export_handle ? &einfo : NULL,
+        .semaphoreType = params->type,
+        .initialValue = params->initial_value,
+    };
+
+    const VkSemaphoreCreateInfo sinfo = {
+        .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,
+        .pNext = &stinfo,
+    };
+
+    VkSemaphore sem = VK_NULL_HANDLE;
+    VK(vk->CreateSemaphore(vk->dev, &sinfo, PL_VK_ALLOC, &sem));
+    PL_VK_NAME(SEMAPHORE, sem, PL_DEF(params->debug_tag, "pl_vulkan_sem"));
+
+#ifdef PL_HAVE_UNIX
+    if (params->export_handle == PL_HANDLE_FD) {
+        VkSemaphoreGetFdInfoKHR finfo = {
+            .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR,
+            .handleType = einfo.handleTypes,
+            .semaphore = sem,
+        };
+
+        VK(vk->GetSemaphoreFdKHR(vk->dev, &finfo, &params->out_handle->fd));
+    }
+#endif
+
+#ifdef PL_HAVE_WIN32
+    if (params->export_handle == PL_HANDLE_WIN32 ||
+        params->export_handle == PL_HANDLE_WIN32_KMT)
+    {
+        VkSemaphoreGetWin32HandleInfoKHR handle_info = {
+            .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR,
+            .handleType = einfo.handleTypes,
+            .semaphore = sem,
+        };
+
+        VK(vk->GetSemaphoreWin32HandleKHR(vk->dev, &handle_info,
+                                          &params->out_handle->handle));
+    }
+#endif
+
+    return sem;
+
+error:
+#ifdef PL_HAVE_UNIX
+    if (params->export_handle == PL_HANDLE_FD) {
+        if (params->out_handle->fd > -1)
+            close(params->out_handle->fd);
+    }
+#endif
+#ifdef PL_HAVE_WIN32
+    if (params->export_handle == PL_HANDLE_WIN32) {
+        if (params->out_handle->handle != NULL)
+            CloseHandle(params->out_handle->handle);
+    }
+    // PL_HANDLE_WIN32_KMT is just an identifier. It doesn't get closed.
+#endif + vk->DestroySemaphore(vk->dev, sem, PL_VK_ALLOC); + return VK_NULL_HANDLE; +} + +static void vk_gpu_flush(pl_gpu gpu) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + CMD_SUBMIT(NULL); + vk_rotate_queues(vk); + vk_malloc_garbage_collect(vk->ma); +} + +static void vk_gpu_finish(pl_gpu gpu) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + CMD_SUBMIT(NULL); + vk_wait_idle(vk); +} + +static bool vk_gpu_is_failed(pl_gpu gpu) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + return vk->failed; +} + +struct vk_cmd *pl_vk_steal_cmd(pl_gpu gpu) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + + pl_mutex_lock(&p->recording); + struct vk_cmd *cmd = p->cmd; + p->cmd = NULL; + pl_mutex_unlock(&p->recording); + + struct vk_cmdpool *pool = vk->pool_graphics; + if (!cmd || cmd->pool != pool) { + vk_cmd_submit(&cmd); + cmd = vk_cmd_begin(pool, NULL); + } + + return cmd; +} + +void pl_vk_print_heap(pl_gpu gpu, enum pl_log_level lev) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + vk_malloc_print_stats(vk->ma, lev); +} + +static const struct pl_gpu_fns pl_fns_vk = { + .destroy = vk_gpu_destroy, + .tex_create = vk_tex_create, + .tex_destroy = vk_tex_deref, + .tex_invalidate = vk_tex_invalidate, + .tex_clear_ex = vk_tex_clear_ex, + .tex_blit = vk_tex_blit, + .tex_upload = vk_tex_upload, + .tex_download = vk_tex_download, + .tex_poll = vk_tex_poll, + .tex_export = vk_tex_export, + .buf_create = vk_buf_create, + .buf_destroy = vk_buf_deref, + .buf_write = vk_buf_write, + .buf_read = vk_buf_read, + .buf_copy = vk_buf_copy, + .buf_export = vk_buf_export, + .buf_poll = vk_buf_poll, + .desc_namespace = vk_desc_namespace, + .pass_create = vk_pass_create, + .pass_destroy = vk_pass_destroy, + .pass_run = vk_pass_run, + .sync_create = vk_sync_create, + .sync_destroy = vk_sync_deref, + .timer_create = vk_timer_create, + .timer_destroy = vk_timer_destroy, + .timer_query = vk_timer_query, + .gpu_flush = vk_gpu_flush, + .gpu_finish = vk_gpu_finish, + .gpu_is_failed = vk_gpu_is_failed, +}; diff --git a/src/vulkan/gpu.h b/src/vulkan/gpu.h new file mode 100644 index 0000000..041de13 --- /dev/null +++ b/src/vulkan/gpu.h @@ -0,0 +1,175 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include "common.h" +#include "command.h" +#include "formats.h" +#include "malloc.h" +#include "utils.h" + +#include "../gpu.h" +#include "../glsl/spirv.h" +#include "../pl_thread.h" + +pl_gpu pl_gpu_create_vk(struct vk_ctx *vk); + +// This function takes the current graphics command and steals it from the +// GPU, so the caller can do custom vk_cmd_ calls on it. The caller should +// submit it as well. 
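+//
+// A rough usage sketch (illustrative only; the recording in between is up to
+// the caller):
+//
+//   struct vk_cmd *cmd = pl_vk_steal_cmd(gpu);
+//   // ... record custom work into cmd->buf via vk_cmd_* helpers ...
+//   vk_cmd_submit(&cmd);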
+struct vk_cmd *pl_vk_steal_cmd(pl_gpu gpu); + +// Print memory usage statistics +void pl_vk_print_heap(pl_gpu, enum pl_log_level); + +// --- pl_gpu internal structs and helpers + +struct pl_fmt_vk { + const struct vk_format *vk_fmt; + bool blit_emulated; +}; + +enum queue_type { + GRAPHICS, + COMPUTE, + TRANSFER, + ANY, +}; + +struct pl_vk { + struct pl_gpu_fns impl; + struct vk_ctx *vk; + pl_spirv spirv; + + // Some additional cached device limits and features checks + uint32_t max_push_descriptors; + size_t min_texel_alignment; + + // The "currently recording" command. This will be queued and replaced by + // a new command every time we need to "switch" between queue families. + pl_mutex recording; + struct vk_cmd *cmd; + pl_timer cmd_timer; + + // Array of VkSamplers for every combination of sample/address modes + VkSampler samplers[PL_TEX_SAMPLE_MODE_COUNT][PL_TEX_ADDRESS_MODE_COUNT]; + + // To avoid spamming warnings + bool warned_modless; +}; + +struct vk_cmd *_begin_cmd(pl_gpu, enum queue_type, const char *label, pl_timer); +bool _end_cmd(pl_gpu, struct vk_cmd **, bool submit); + +#define CMD_BEGIN(type) _begin_cmd(gpu, type, __func__, NULL) +#define CMD_BEGIN_TIMED(type, timer) _begin_cmd(gpu, type, __func__, timer) +#define CMD_FINISH(cmd) _end_cmd(gpu, cmd, false) +#define CMD_SUBMIT(cmd) _end_cmd(gpu, cmd, true) + +// Helper to fire a callback the next time the `pl_gpu` is in an idle state +// +// Use this instead of `vk_dev_callback` when you need to clean up after +// resources that might possibly still be in use by the `pl_gpu` at the time of +// creating the callback. +void vk_gpu_idle_callback(pl_gpu, vk_cb, const void *priv, const void *arg); + +struct pl_tex_vk { + pl_rc_t rc; + bool external_img; + enum queue_type transfer_queue; + VkImageType type; + VkImage img; + VkImageAspectFlags aspect; + struct vk_memslice mem; + // cached properties + VkFormat img_fmt; + VkImageUsageFlags usage_flags; + // for sampling + VkImageView view; + // for rendering + VkFramebuffer framebuffer; + // for vk_tex_upload/download fallback code + pl_fmt texel_fmt; + // for planar textures (as a convenience) + int num_planes; + struct pl_tex_vk *planes[4]; + + // synchronization and current state (planes only) + struct vk_sem sem; + VkImageLayout layout; + PL_ARRAY(pl_vulkan_sem) ext_deps; // external semaphore, not owned by the pl_tex + pl_sync ext_sync; // indicates an exported image + uint32_t qf; // last queue family to access this texture (for barriers) + bool may_invalidate; + bool held; +}; + +pl_tex vk_tex_create(pl_gpu, const struct pl_tex_params *); +void vk_tex_deref(pl_gpu, pl_tex); +void vk_tex_invalidate(pl_gpu, pl_tex); +void vk_tex_clear_ex(pl_gpu, pl_tex, const union pl_clear_color); +void vk_tex_blit(pl_gpu, const struct pl_tex_blit_params *); +bool vk_tex_upload(pl_gpu, const struct pl_tex_transfer_params *); +bool vk_tex_download(pl_gpu, const struct pl_tex_transfer_params *); +bool vk_tex_poll(pl_gpu, pl_tex, uint64_t timeout); +bool vk_tex_export(pl_gpu, pl_tex, pl_sync); +void vk_tex_barrier(pl_gpu, struct vk_cmd *, pl_tex, VkPipelineStageFlags2, + VkAccessFlags2, VkImageLayout, uint32_t qf); + +struct pl_buf_vk { + pl_rc_t rc; + struct vk_memslice mem; + enum queue_type update_queue; + VkBufferView view; // for texel buffers + + // synchronization and current state + struct vk_sem sem; + bool exported; + bool needs_flush; +}; + +pl_buf vk_buf_create(pl_gpu, const struct pl_buf_params *); +void vk_buf_deref(pl_gpu, pl_buf); +void vk_buf_write(pl_gpu, pl_buf, size_t 
offset, const void *src, size_t size); +bool vk_buf_read(pl_gpu, pl_buf, size_t offset, void *dst, size_t size); +void vk_buf_copy(pl_gpu, pl_buf dst, size_t dst_offset, + pl_buf src, size_t src_offset, size_t size); +bool vk_buf_export(pl_gpu, pl_buf); +bool vk_buf_poll(pl_gpu, pl_buf, uint64_t timeout); + +// Helper to ease buffer barrier creation. (`offset` is relative to pl_buf) +void vk_buf_barrier(pl_gpu, struct vk_cmd *, pl_buf, VkPipelineStageFlags2, + VkAccessFlags2, size_t offset, size_t size, bool export); + +// Flush visible writes to a buffer made by the API +void vk_buf_flush(pl_gpu, struct vk_cmd *, pl_buf, size_t offset, size_t size); + +struct pl_pass_vk; + +int vk_desc_namespace(pl_gpu, enum pl_desc_type); +pl_pass vk_pass_create(pl_gpu, const struct pl_pass_params *); +void vk_pass_destroy(pl_gpu, pl_pass); +void vk_pass_run(pl_gpu, const struct pl_pass_run_params *); + +struct pl_sync_vk { + pl_rc_t rc; + VkSemaphore wait; + VkSemaphore signal; +}; + +void vk_sync_deref(pl_gpu, pl_sync); diff --git a/src/vulkan/gpu_buf.c b/src/vulkan/gpu_buf.c new file mode 100644 index 0000000..2f317bc --- /dev/null +++ b/src/vulkan/gpu_buf.c @@ -0,0 +1,470 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "gpu.h" + +void vk_buf_barrier(pl_gpu gpu, struct vk_cmd *cmd, pl_buf buf, + VkPipelineStageFlags2 stage, VkAccessFlags2 access, + size_t offset, size_t size, bool export) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + struct pl_buf_vk *buf_vk = PL_PRIV(buf); + pl_assert(!export || !buf_vk->exported); // can't re-export exported buffers + pl_rc_ref(&buf_vk->rc); + + bool needs_flush = buf_vk->needs_flush || buf->params.host_mapped || + buf->params.import_handle == PL_HANDLE_HOST_PTR; + bool noncoherent = buf_vk->mem.data && !buf_vk->mem.coherent; + if (needs_flush && noncoherent) { + VK(vk->FlushMappedMemoryRanges(vk->dev, 1, &(struct VkMappedMemoryRange) { + .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE, + .memory = buf_vk->mem.vkmem, + .offset = buf_vk->mem.map_offset, + .size = buf_vk->mem.map_size, + })); + + // Just ignore errors, not much we can do about them other than + // logging them and moving on... + error: ; + } + + struct vk_sync_scope last; + last = vk_sem_barrier(cmd, &buf_vk->sem, stage, access, export); + + // CONCURRENT buffers require transitioning to/from IGNORED, EXCLUSIVE + // buffers require transitioning to/from the concrete QF index + uint32_t qf = vk->pools.num > 1 ? VK_QUEUE_FAMILY_IGNORED : cmd->pool->qf; + uint32_t src_qf = buf_vk->exported ? VK_QUEUE_FAMILY_EXTERNAL_KHR : qf; + uint32_t dst_qf = export ? 
VK_QUEUE_FAMILY_EXTERNAL_KHR : qf; + + if (last.access || src_qf != dst_qf) { + vk_cmd_barrier(cmd, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .bufferMemoryBarrierCount = 1, + .pBufferMemoryBarriers = &(VkBufferMemoryBarrier2) { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, + .srcStageMask = last.stage, + .srcAccessMask = last.access, + .dstStageMask = stage, + .dstAccessMask = access, + .srcQueueFamilyIndex = src_qf, + .dstQueueFamilyIndex = dst_qf, + .buffer = buf_vk->mem.buf, + .offset = buf_vk->mem.offset + offset, + .size = size, + }, + }); + } + + buf_vk->needs_flush = false; + buf_vk->exported = export; + vk_cmd_callback(cmd, (vk_cb) vk_buf_deref, gpu, buf); +} + +void vk_buf_deref(pl_gpu gpu, pl_buf buf) +{ + if (!buf) + return; + + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + struct pl_buf_vk *buf_vk = PL_PRIV(buf); + + if (pl_rc_deref(&buf_vk->rc)) { + vk->DestroyBufferView(vk->dev, buf_vk->view, PL_VK_ALLOC); + vk_malloc_free(vk->ma, &buf_vk->mem); + pl_free((void *) buf); + } +} + +pl_buf vk_buf_create(pl_gpu gpu, const struct pl_buf_params *params) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + + struct pl_buf_t *buf = pl_zalloc_obj(NULL, buf, struct pl_buf_vk); + buf->params = *params; + buf->params.initial_data = NULL; + + struct pl_buf_vk *buf_vk = PL_PRIV(buf); + pl_rc_init(&buf_vk->rc); + + struct vk_malloc_params mparams = { + .reqs = { + .size = PL_ALIGN2(params->size, 4), // for vk_buf_write + .memoryTypeBits = UINT32_MAX, + .alignment = 1, + }, + // these are always set, because `vk_buf_copy` can always be used + .buf_usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | + VK_BUFFER_USAGE_TRANSFER_DST_BIT, + .export_handle = params->export_handle, + .import_handle = params->import_handle, + .shared_mem = params->shared_mem, + .debug_tag = params->debug_tag, + }; + + // Mandatory/optimal buffer offset alignment + VkDeviceSize *align = &mparams.reqs.alignment; + VkDeviceSize extra_align = vk->props.limits.optimalBufferCopyOffsetAlignment; + + // Try and align all buffers to the minimum texel alignment, to make sure + // tex_upload/tex_download always gets aligned buffer copies if possible + extra_align = pl_lcm(extra_align, p->min_texel_alignment); + + enum pl_buf_mem_type mem_type = params->memory_type; + bool is_texel = false; + + if (params->uniform) { + mparams.buf_usage |= VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT; + *align = pl_lcm(*align, vk->props.limits.minUniformBufferOffsetAlignment); + mem_type = PL_BUF_MEM_DEVICE; + if (params->format) { + mparams.buf_usage |= VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT; + is_texel = true; + } + } + + if (params->storable) { + mparams.buf_usage |= VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; + *align = pl_lcm(*align, vk->props.limits.minStorageBufferOffsetAlignment); + buf_vk->update_queue = COMPUTE; + mem_type = PL_BUF_MEM_DEVICE; + if (params->format) { + mparams.buf_usage |= VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT; + is_texel = true; + } + } + + if (is_texel) { + *align = pl_lcm(*align, vk->props.limits.minTexelBufferOffsetAlignment); + *align = pl_lcm(*align, params->format->texel_size); + } + + if (params->drawable) { + mparams.buf_usage |= VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | + VK_BUFFER_USAGE_INDEX_BUFFER_BIT; + mem_type = PL_BUF_MEM_DEVICE; + } + + if (params->host_writable || params->initial_data) { + // Buffers should be written using mapped memory if possible + mparams.optimal = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; + // Use the transfer queue for updates on very large 
buffers (1 MB) + if (params->size > 1024*1024) + buf_vk->update_queue = TRANSFER; + } + + if (params->host_mapped || params->host_readable) { + mparams.required |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; + + if (params->size > 1024) { + // Prefer cached memory for large buffers (1 kB) which may be read + // from, because uncached reads are extremely slow + mparams.optimal |= VK_MEMORY_PROPERTY_HOST_CACHED_BIT; + } + } + + switch (mem_type) { + case PL_BUF_MEM_AUTO: + // We generally prefer VRAM since it's faster than RAM, but any number + // of other requirements could potentially exclude it, so just mark it + // as optimal by default. + if (!(mparams.optimal & VK_MEMORY_PROPERTY_HOST_CACHED_BIT)) + mparams.optimal |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; + break; + case PL_BUF_MEM_DEVICE: + // Force device local memory. + mparams.required |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; + break; + case PL_BUF_MEM_HOST: + // This isn't a true guarantee, but actually trying to restrict the + // device-local bit locks out all memory heaps on iGPUs. Requiring + // the memory be host-mapped is the easiest compromise. + mparams.required |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; + mparams.optimal |= VK_MEMORY_PROPERTY_HOST_CACHED_BIT; + break; + case PL_BUF_MEM_TYPE_COUNT: + pl_unreachable(); + } + + if (params->import_handle) { + size_t offset = params->shared_mem.offset; + if (PL_ALIGN(offset, *align) != offset) { + PL_ERR(gpu, "Imported memory offset %zu violates minimum alignment " + "requirement of enabled usage flags (%zu)!", + offset, (size_t) *align); + goto error; + } + } else { + *align = pl_lcm(*align, extra_align); + } + + if (!vk_malloc_slice(vk->ma, &buf_vk->mem, &mparams)) + goto error; + + if (params->host_mapped) + buf->data = buf_vk->mem.data; + + if (params->export_handle) { + buf->shared_mem = buf_vk->mem.shared_mem; + buf->shared_mem.drm_format_mod = DRM_FORMAT_MOD_LINEAR; + buf_vk->exported = true; + } + + if (is_texel) { + struct pl_fmt_vk *fmtp = PL_PRIV(params->format); + VkBufferViewCreateInfo vinfo = { + .sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO, + .buffer = buf_vk->mem.buf, + .format = PL_DEF(fmtp->vk_fmt->bfmt, fmtp->vk_fmt->tfmt), + .offset = buf_vk->mem.offset, + .range = buf_vk->mem.size, + }; + + VK(vk->CreateBufferView(vk->dev, &vinfo, PL_VK_ALLOC, &buf_vk->view)); + PL_VK_NAME(BUFFER_VIEW, buf_vk->view, PL_DEF(params->debug_tag, "texel")); + } + + if (params->initial_data) + vk_buf_write(gpu, buf, 0, params->initial_data, params->size); + + return buf; + +error: + vk_buf_deref(gpu, buf); + return NULL; +} + +static void invalidate_buf(pl_gpu gpu, pl_buf buf) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + struct pl_buf_vk *buf_vk = PL_PRIV(buf); + + if (buf_vk->mem.data && !buf_vk->mem.coherent) { + VK(vk->InvalidateMappedMemoryRanges(vk->dev, 1, &(VkMappedMemoryRange) { + .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE, + .memory = buf_vk->mem.vkmem, + .offset = buf_vk->mem.map_offset, + .size = buf_vk->mem.map_size, + })); + } + + // Ignore errors (after logging), nothing useful we can do anyway +error: ; + vk_buf_deref(gpu, buf); +} + +void vk_buf_flush(pl_gpu gpu, struct vk_cmd *cmd, pl_buf buf, + size_t offset, size_t size) +{ + struct pl_buf_vk *buf_vk = PL_PRIV(buf); + + // We need to perform a flush if the host is capable of reading back from + // the buffer, or if we intend to overwrite it using mapped memory + bool can_read = buf->params.host_readable; + bool can_write = buf_vk->mem.data && buf->params.host_writable; + if 
(buf->params.host_mapped || buf->params.import_handle == PL_HANDLE_HOST_PTR) + can_read = can_write = true; + + if (!can_read && !can_write) + return; + + vk_cmd_barrier(cmd, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .bufferMemoryBarrierCount = 1, + .pBufferMemoryBarriers = &(VkBufferMemoryBarrier2) { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, + .srcStageMask = buf_vk->sem.write.stage, + .srcAccessMask = buf_vk->sem.write.access, + .dstStageMask = VK_PIPELINE_STAGE_2_HOST_BIT, + .dstAccessMask = (can_read ? VK_ACCESS_2_HOST_READ_BIT : 0) + | (can_write ? VK_ACCESS_2_HOST_WRITE_BIT : 0), + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = buf_vk->mem.buf, + .offset = buf_vk->mem.offset + offset, + .size = size, + }, + }); + + // We need to hold on to the buffer until this barrier completes + vk_cmd_callback(cmd, (vk_cb) invalidate_buf, gpu, buf); + pl_rc_ref(&buf_vk->rc); +} + +bool vk_buf_poll(pl_gpu gpu, pl_buf buf, uint64_t timeout) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + struct pl_buf_vk *buf_vk = PL_PRIV(buf); + + // Opportunistically check if we can re-use this buffer without flush + vk_poll_commands(vk, 0); + if (pl_rc_count(&buf_vk->rc) == 1) + return false; + + // Otherwise, we're force to submit any queued command so that the + // user is guaranteed to see progress eventually, even if they call + // this in a tight loop + CMD_SUBMIT(NULL); + vk_poll_commands(vk, timeout); + + return pl_rc_count(&buf_vk->rc) > 1; +} + +void vk_buf_write(pl_gpu gpu, pl_buf buf, size_t offset, + const void *data, size_t size) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + struct pl_buf_vk *buf_vk = PL_PRIV(buf); + + // For host-mapped buffers, we can just directly memcpy the buffer contents. + // Otherwise, we can update the buffer from the GPU using a command buffer. + if (buf_vk->mem.data) { + // ensure no queued operations + while (vk_buf_poll(gpu, buf, UINT64_MAX)) + ; // do nothing + + uintptr_t addr = (uintptr_t) buf_vk->mem.data + offset; + memcpy((void *) addr, data, size); + buf_vk->needs_flush = true; + } else { + struct vk_cmd *cmd = CMD_BEGIN(buf_vk->update_queue); + if (!cmd) { + PL_ERR(gpu, "Failed updating buffer!"); + return; + } + + vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_COPY_BIT, + VK_ACCESS_2_TRANSFER_WRITE_BIT, offset, size, false); + + // Vulkan requires `size` to be a multiple of 4, so we need to make + // sure to handle the end separately if the original data is not + const size_t max_transfer = 64 * 1024; + size_t size_rem = size % 4; + size_t size_base = size - size_rem; + VkDeviceSize buf_offset = buf_vk->mem.offset + offset; + + if (size_base > max_transfer) { + PL_TRACE(gpu, "Using multiple vkCmdUpdateBuffer calls to upload " + "large buffer. 
Consider using buffer-buffer transfers " + "instead!"); + } + + for (size_t xfer = 0; xfer < size_base; xfer += max_transfer) { + vk->CmdUpdateBuffer(cmd->buf, buf_vk->mem.buf, + buf_offset + xfer, + PL_MIN(size_base, max_transfer), + (void *) ((uint8_t *) data + xfer)); + } + + if (size_rem) { + uint8_t tail[4] = {0}; + memcpy(tail, data, size_rem); + vk->CmdUpdateBuffer(cmd->buf, buf_vk->mem.buf, buf_offset + size_base, + sizeof(tail), tail); + } + + pl_assert(!buf->params.host_readable); // no flush needed due to this + CMD_FINISH(&cmd); + } +} + +bool vk_buf_read(pl_gpu gpu, pl_buf buf, size_t offset, void *dest, size_t size) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + struct pl_buf_vk *buf_vk = PL_PRIV(buf); + pl_assert(buf_vk->mem.data); + + if (vk_buf_poll(gpu, buf, 0) && buf_vk->sem.write.sync.sem) { + // ensure no more queued writes + VK(vk->WaitSemaphores(vk->dev, &(VkSemaphoreWaitInfo) { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO, + .semaphoreCount = 1, + .pSemaphores = &buf_vk->sem.write.sync.sem, + .pValues = &buf_vk->sem.write.sync.value, + }, UINT64_MAX)); + + // process callbacks + vk_poll_commands(vk, 0); + } + + uintptr_t addr = (uintptr_t) buf_vk->mem.data + (size_t) offset; + memcpy(dest, (void *) addr, size); + return true; + +error: + return false; +} + +void vk_buf_copy(pl_gpu gpu, pl_buf dst, size_t dst_offset, + pl_buf src, size_t src_offset, size_t size) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + struct pl_buf_vk *dst_vk = PL_PRIV(dst); + struct pl_buf_vk *src_vk = PL_PRIV(src); + + struct vk_cmd *cmd = CMD_BEGIN(dst_vk->update_queue); + if (!cmd) { + PL_ERR(gpu, "Failed copying buffer!"); + return; + } + + vk_buf_barrier(gpu, cmd, dst, VK_PIPELINE_STAGE_2_COPY_BIT, + VK_ACCESS_2_TRANSFER_WRITE_BIT, dst_offset, size, false); + vk_buf_barrier(gpu, cmd, src, VK_PIPELINE_STAGE_2_COPY_BIT, + VK_ACCESS_2_TRANSFER_READ_BIT, src_offset, size, false); + + VkBufferCopy region = { + .srcOffset = src_vk->mem.offset + src_offset, + .dstOffset = dst_vk->mem.offset + dst_offset, + .size = size, + }; + + vk->CmdCopyBuffer(cmd->buf, src_vk->mem.buf, dst_vk->mem.buf, + 1, ®ion); + + vk_buf_flush(gpu, cmd, dst, dst_offset, size); + CMD_FINISH(&cmd); +} + +bool vk_buf_export(pl_gpu gpu, pl_buf buf) +{ + struct pl_buf_vk *buf_vk = PL_PRIV(buf); + if (buf_vk->exported) + return true; + + struct vk_cmd *cmd = CMD_BEGIN(ANY); + if (!cmd) { + PL_ERR(gpu, "Failed exporting buffer!"); + return false; + } + + // For the queue family ownership transfer, we can ignore all pipeline + // stages since the synchronization via fences/semaphores is required + vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_NONE, 0, 0, + buf->params.size, true); + + + return CMD_SUBMIT(&cmd); +} diff --git a/src/vulkan/gpu_pass.c b/src/vulkan/gpu_pass.c new file mode 100644 index 0000000..5ffe77d --- /dev/null +++ b/src/vulkan/gpu_pass.c @@ -0,0 +1,964 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "gpu.h" +#include "cache.h" +#include "glsl/spirv.h" + +// For pl_pass.priv +struct pl_pass_vk { + // Pipeline / render pass + VkPipeline base; + VkPipeline pipe; + VkPipelineLayout pipeLayout; + VkRenderPass renderPass; + // Descriptor set (bindings) + bool use_pushd; + VkDescriptorSetLayout dsLayout; + VkDescriptorPool dsPool; + // To keep track of which descriptor sets are and aren't available, we + // allocate a fixed number and use a bitmask of all available sets. + VkDescriptorSet dss[16]; + uint16_t dmask; + + // For recompilation + VkVertexInputAttributeDescription *attrs; + VkPipelineCache cache; + VkShaderModule vert; + VkShaderModule shader; + + // For updating + VkWriteDescriptorSet *dswrite; + VkDescriptorImageInfo *dsiinfo; + VkDescriptorBufferInfo *dsbinfo; + VkSpecializationInfo specInfo; + size_t spec_size; +}; + +int vk_desc_namespace(pl_gpu gpu, enum pl_desc_type type) +{ + return 0; +} + +static void pass_destroy_cb(pl_gpu gpu, pl_pass pass) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + struct pl_pass_vk *pass_vk = PL_PRIV(pass); + + vk->DestroyPipeline(vk->dev, pass_vk->pipe, PL_VK_ALLOC); + vk->DestroyPipeline(vk->dev, pass_vk->base, PL_VK_ALLOC); + vk->DestroyRenderPass(vk->dev, pass_vk->renderPass, PL_VK_ALLOC); + vk->DestroyPipelineLayout(vk->dev, pass_vk->pipeLayout, PL_VK_ALLOC); + vk->DestroyPipelineCache(vk->dev, pass_vk->cache, PL_VK_ALLOC); + vk->DestroyDescriptorPool(vk->dev, pass_vk->dsPool, PL_VK_ALLOC); + vk->DestroyDescriptorSetLayout(vk->dev, pass_vk->dsLayout, PL_VK_ALLOC); + vk->DestroyShaderModule(vk->dev, pass_vk->vert, PL_VK_ALLOC); + vk->DestroyShaderModule(vk->dev, pass_vk->shader, PL_VK_ALLOC); + + pl_free((void *) pass); +} + +void vk_pass_destroy(pl_gpu gpu, pl_pass pass) +{ + vk_gpu_idle_callback(gpu, (vk_cb) pass_destroy_cb, gpu, pass); +} + +static const VkDescriptorType dsType[] = { + [PL_DESC_SAMPLED_TEX] = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + [PL_DESC_STORAGE_IMG] = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + [PL_DESC_BUF_UNIFORM] = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + [PL_DESC_BUF_STORAGE] = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + [PL_DESC_BUF_TEXEL_UNIFORM] = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, + [PL_DESC_BUF_TEXEL_STORAGE] = VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, +}; + +static VkResult vk_compile_glsl(pl_gpu gpu, void *alloc, + enum glsl_shader_stage stage, + const char *shader, + pl_cache_obj *out_spirv) +{ + struct pl_vk *p = PL_PRIV(gpu); + pl_cache cache = pl_gpu_cache(gpu); + uint64_t key = CACHE_KEY_SPIRV; + if (cache) { // skip computing key if `cache + pl_hash_merge(&key, p->spirv->signature); + pl_hash_merge(&key, pl_str0_hash(shader)); + out_spirv->key = key; + if (pl_cache_get(cache, out_spirv)) { + PL_DEBUG(gpu, "Re-using cached SPIR-V object 0x%"PRIx64, key); + return VK_SUCCESS; + } + } + + pl_clock_t start = pl_clock_now(); + pl_str spirv = pl_spirv_compile_glsl(p->spirv, alloc, gpu->glsl, stage, shader); + pl_log_cpu_time(gpu->log, start, pl_clock_now(), "translating SPIR-V"); + out_spirv->data = spirv.buf; + out_spirv->size = spirv.len; + out_spirv->free = pl_free; + return spirv.len ? 
VK_SUCCESS : VK_ERROR_INITIALIZATION_FAILED; +} + +static const VkShaderStageFlags stageFlags[] = { + [PL_PASS_RASTER] = VK_SHADER_STAGE_FRAGMENT_BIT | + VK_SHADER_STAGE_VERTEX_BIT, + [PL_PASS_COMPUTE] = VK_SHADER_STAGE_COMPUTE_BIT, +}; + +static void destroy_pipeline(struct vk_ctx *vk, void *pipeline) +{ + vk->DestroyPipeline(vk->dev, vk_unwrap_handle(pipeline), PL_VK_ALLOC); +} + +static VkResult vk_recreate_pipelines(struct vk_ctx *vk, pl_pass pass, + bool derivable, VkPipeline base, + VkPipeline *out_pipe) +{ + struct pl_pass_vk *pass_vk = PL_PRIV(pass); + const struct pl_pass_params *params = &pass->params; + + // The old pipeline might still be in use, so we have to destroy it + // asynchronously with a device idle callback + if (*out_pipe) { + // We don't need to use `vk_gpu_idle_callback` because the only command + // that can access a VkPipeline, `vk_pass_run`, always flushes `p->cmd`. + vk_dev_callback(vk, (vk_cb) destroy_pipeline, vk, vk_wrap_handle(*out_pipe)); + *out_pipe = VK_NULL_HANDLE; + } + + VkPipelineCreateFlags flags = 0; + if (derivable) + flags |= VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT; + if (base) + flags |= VK_PIPELINE_CREATE_DERIVATIVE_BIT; + + const VkSpecializationInfo *specInfo = &pass_vk->specInfo; + if (!specInfo->dataSize) + specInfo = NULL; + + switch (params->type) { + case PL_PASS_RASTER: { + static const VkBlendFactor blendFactors[] = { + [PL_BLEND_ZERO] = VK_BLEND_FACTOR_ZERO, + [PL_BLEND_ONE] = VK_BLEND_FACTOR_ONE, + [PL_BLEND_SRC_ALPHA] = VK_BLEND_FACTOR_SRC_ALPHA, + [PL_BLEND_ONE_MINUS_SRC_ALPHA] = VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA, + }; + + VkPipelineColorBlendAttachmentState blendState = { + .colorBlendOp = VK_BLEND_OP_ADD, + .alphaBlendOp = VK_BLEND_OP_ADD, + .colorWriteMask = VK_COLOR_COMPONENT_R_BIT | + VK_COLOR_COMPONENT_G_BIT | + VK_COLOR_COMPONENT_B_BIT | + VK_COLOR_COMPONENT_A_BIT, + }; + + const struct pl_blend_params *blend = params->blend_params; + if (blend) { + blendState.blendEnable = true; + blendState.srcColorBlendFactor = blendFactors[blend->src_rgb]; + blendState.dstColorBlendFactor = blendFactors[blend->dst_rgb]; + blendState.srcAlphaBlendFactor = blendFactors[blend->src_alpha]; + blendState.dstAlphaBlendFactor = blendFactors[blend->dst_alpha]; + } + + static const VkPrimitiveTopology topologies[PL_PRIM_TYPE_COUNT] = { + [PL_PRIM_TRIANGLE_LIST] = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST, + [PL_PRIM_TRIANGLE_STRIP] = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP, + }; + + VkGraphicsPipelineCreateInfo cinfo = { + .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, + .flags = flags, + .stageCount = 2, + .pStages = (VkPipelineShaderStageCreateInfo[]) { + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_VERTEX_BIT, + .module = pass_vk->vert, + .pName = "main", + }, { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_FRAGMENT_BIT, + .module = pass_vk->shader, + .pName = "main", + .pSpecializationInfo = specInfo, + } + }, + .pVertexInputState = &(VkPipelineVertexInputStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO, + .vertexBindingDescriptionCount = 1, + .pVertexBindingDescriptions = &(VkVertexInputBindingDescription) { + .binding = 0, + .stride = params->vertex_stride, + .inputRate = VK_VERTEX_INPUT_RATE_VERTEX, + }, + .vertexAttributeDescriptionCount = params->num_vertex_attribs, + .pVertexAttributeDescriptions = pass_vk->attrs, + }, + .pInputAssemblyState = &(VkPipelineInputAssemblyStateCreateInfo) { + .sType = 
VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO, + .topology = topologies[params->vertex_type], + }, + .pViewportState = &(VkPipelineViewportStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO, + .viewportCount = 1, + .scissorCount = 1, + }, + .pRasterizationState = &(VkPipelineRasterizationStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO, + .polygonMode = VK_POLYGON_MODE_FILL, + .cullMode = VK_CULL_MODE_NONE, + .lineWidth = 1.0f, + }, + .pMultisampleState = &(VkPipelineMultisampleStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO, + .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT, + }, + .pColorBlendState = &(VkPipelineColorBlendStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO, + .attachmentCount = 1, + .pAttachments = &blendState, + }, + .pDynamicState = &(VkPipelineDynamicStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO, + .dynamicStateCount = 2, + .pDynamicStates = (VkDynamicState[]){ + VK_DYNAMIC_STATE_VIEWPORT, + VK_DYNAMIC_STATE_SCISSOR, + }, + }, + .layout = pass_vk->pipeLayout, + .renderPass = pass_vk->renderPass, + .basePipelineHandle = base, + .basePipelineIndex = -1, + }; + + return vk->CreateGraphicsPipelines(vk->dev, pass_vk->cache, 1, &cinfo, + PL_VK_ALLOC, out_pipe); + } + + case PL_PASS_COMPUTE: { + VkComputePipelineCreateInfo cinfo = { + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .flags = flags, + .stage = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .module = pass_vk->shader, + .pName = "main", + .pSpecializationInfo = specInfo, + }, + .layout = pass_vk->pipeLayout, + .basePipelineHandle = base, + .basePipelineIndex = -1, + }; + + return vk->CreateComputePipelines(vk->dev, pass_vk->cache, 1, &cinfo, + PL_VK_ALLOC, out_pipe); + } + + case PL_PASS_INVALID: + case PL_PASS_TYPE_COUNT: + break; + } + + pl_unreachable(); +} + +pl_pass vk_pass_create(pl_gpu gpu, const struct pl_pass_params *params) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + bool success = false; + + struct pl_pass_t *pass = pl_zalloc_obj(NULL, pass, struct pl_pass_vk); + pass->params = pl_pass_params_copy(pass, params); + + struct pl_pass_vk *pass_vk = PL_PRIV(pass); + pass_vk->dmask = -1; // all descriptors available + + // temporary allocations + void *tmp = pl_tmp(NULL); + + int num_desc = params->num_descriptors; + if (!num_desc) + goto no_descriptors; + if (num_desc > vk->props.limits.maxPerStageResources) { + PL_ERR(gpu, "Pass with %d descriptors exceeds the maximum number of " + "per-stage resources %" PRIu32"!", + num_desc, vk->props.limits.maxPerStageResources); + goto error; + } + + pass_vk->dswrite = pl_calloc(pass, num_desc, sizeof(VkWriteDescriptorSet)); + pass_vk->dsiinfo = pl_calloc(pass, num_desc, sizeof(VkDescriptorImageInfo)); + pass_vk->dsbinfo = pl_calloc(pass, num_desc, sizeof(VkDescriptorBufferInfo)); + +#define NUM_DS (PL_ARRAY_SIZE(pass_vk->dss)) + + int dsSize[PL_DESC_TYPE_COUNT] = {0}; + VkDescriptorSetLayoutBinding *bindings = pl_calloc_ptr(tmp, num_desc, bindings); + + uint32_t max_tex = vk->props.limits.maxPerStageDescriptorSampledImages, + max_img = vk->props.limits.maxPerStageDescriptorStorageImages, + max_ubo = vk->props.limits.maxPerStageDescriptorUniformBuffers, + max_ssbo = vk->props.limits.maxPerStageDescriptorStorageBuffers; + + uint32_t *dsLimits[PL_DESC_TYPE_COUNT] = { + 
[PL_DESC_SAMPLED_TEX] = &max_tex, + [PL_DESC_STORAGE_IMG] = &max_img, + [PL_DESC_BUF_UNIFORM] = &max_ubo, + [PL_DESC_BUF_STORAGE] = &max_ssbo, + [PL_DESC_BUF_TEXEL_UNIFORM] = &max_tex, + [PL_DESC_BUF_TEXEL_STORAGE] = &max_img, + }; + + for (int i = 0; i < num_desc; i++) { + struct pl_desc *desc = ¶ms->descriptors[i]; + if (!(*dsLimits[desc->type])--) { + PL_ERR(gpu, "Pass exceeds the maximum number of per-stage " + "descriptors of type %u!", (unsigned) desc->type); + goto error; + } + + dsSize[desc->type]++; + bindings[i] = (VkDescriptorSetLayoutBinding) { + .binding = desc->binding, + .descriptorType = dsType[desc->type], + .descriptorCount = 1, + .stageFlags = stageFlags[params->type], + }; + } + + VkDescriptorSetLayoutCreateInfo dinfo = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, + .pBindings = bindings, + .bindingCount = num_desc, + }; + + if (p->max_push_descriptors && num_desc <= p->max_push_descriptors) { + dinfo.flags |= VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR; + pass_vk->use_pushd = true; + } else if (p->max_push_descriptors) { + PL_INFO(gpu, "Pass with %d descriptors exceeds the maximum push " + "descriptor count (%d). Falling back to descriptor sets!", + num_desc, p->max_push_descriptors); + } + + VK(vk->CreateDescriptorSetLayout(vk->dev, &dinfo, PL_VK_ALLOC, + &pass_vk->dsLayout)); + + if (!pass_vk->use_pushd) { + PL_ARRAY(VkDescriptorPoolSize) dsPoolSizes = {0}; + + for (enum pl_desc_type t = 0; t < PL_DESC_TYPE_COUNT; t++) { + if (dsSize[t] > 0) { + PL_ARRAY_APPEND(tmp, dsPoolSizes, (VkDescriptorPoolSize) { + .type = dsType[t], + .descriptorCount = dsSize[t] * NUM_DS, + }); + } + } + + if (dsPoolSizes.num) { + VkDescriptorPoolCreateInfo pinfo = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, + .maxSets = NUM_DS, + .pPoolSizes = dsPoolSizes.elem, + .poolSizeCount = dsPoolSizes.num, + }; + + VK(vk->CreateDescriptorPool(vk->dev, &pinfo, PL_VK_ALLOC, &pass_vk->dsPool)); + + VkDescriptorSetLayout layouts[NUM_DS]; + for (int i = 0; i < NUM_DS; i++) + layouts[i] = pass_vk->dsLayout; + + VkDescriptorSetAllocateInfo ainfo = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO, + .descriptorPool = pass_vk->dsPool, + .descriptorSetCount = NUM_DS, + .pSetLayouts = layouts, + }; + + VK(vk->AllocateDescriptorSets(vk->dev, &ainfo, pass_vk->dss)); + } + } + +no_descriptors: ; + + bool has_spec = params->num_constants; + if (has_spec) { + PL_ARRAY(VkSpecializationMapEntry) entries = {0}; + PL_ARRAY_RESIZE(pass, entries, params->num_constants); + size_t spec_size = 0; + + for (int i = 0; i < params->num_constants; i++) { + const struct pl_constant *con = ¶ms->constants[i]; + size_t con_size = pl_var_type_size(con->type); + entries.elem[i] = (VkSpecializationMapEntry) { + .constantID = con->id, + .offset = con->offset, + .size = con_size, + }; + + size_t req_size = con->offset + con_size; + spec_size = PL_MAX(spec_size, req_size); + } + + pass_vk->spec_size = spec_size; + pass_vk->specInfo = (VkSpecializationInfo) { + .mapEntryCount = params->num_constants, + .pMapEntries = entries.elem, + }; + + if (params->constant_data) { + pass_vk->specInfo.pData = pl_memdup(pass, params->constant_data, spec_size); + pass_vk->specInfo.dataSize = spec_size; + } + } + + VkPipelineLayoutCreateInfo linfo = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .setLayoutCount = num_desc ? 1 : 0, + .pSetLayouts = &pass_vk->dsLayout, + .pushConstantRangeCount = params->push_constants_size ? 
1 : 0, + .pPushConstantRanges = &(VkPushConstantRange){ + .stageFlags = stageFlags[params->type], + .offset = 0, + .size = params->push_constants_size, + }, + }; + + VK(vk->CreatePipelineLayout(vk->dev, &linfo, PL_VK_ALLOC, + &pass_vk->pipeLayout)); + + pl_cache_obj vert = {0}, frag = {0}, comp = {0}; + switch (params->type) { + case PL_PASS_RASTER: ; + VK(vk_compile_glsl(gpu, tmp, GLSL_SHADER_VERTEX, params->vertex_shader, &vert)); + VK(vk_compile_glsl(gpu, tmp, GLSL_SHADER_FRAGMENT, params->glsl_shader, &frag)); + break; + case PL_PASS_COMPUTE: + VK(vk_compile_glsl(gpu, tmp, GLSL_SHADER_COMPUTE, params->glsl_shader, &comp)); + break; + case PL_PASS_INVALID: + case PL_PASS_TYPE_COUNT: + pl_unreachable(); + } + + // Use hash of generated SPIR-V as key for pipeline cache + const pl_cache cache = pl_gpu_cache(gpu); + pl_cache_obj pipecache = {0}; + if (cache) { + pipecache.key = CACHE_KEY_VK_PIPE; + pl_hash_merge(&pipecache.key, pl_var_hash(vk->props.pipelineCacheUUID)); + pl_hash_merge(&pipecache.key, pl_mem_hash(vert.data, vert.size)); + pl_hash_merge(&pipecache.key, pl_mem_hash(frag.data, frag.size)); + pl_hash_merge(&pipecache.key, pl_mem_hash(comp.data, comp.size)); + pl_cache_get(cache, &pipecache); + } + + if (cache || has_spec) { + // Don't create pipeline cache unless we either plan on caching the + // result of this shader to a pl_cache, or if we will possibly re-use + // it due to the presence of specialization constants + VkPipelineCacheCreateInfo pcinfo = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO, + .pInitialData = pipecache.data, + .initialDataSize = pipecache.size, + }; + + VK(vk->CreatePipelineCache(vk->dev, &pcinfo, PL_VK_ALLOC, &pass_vk->cache)); + } + + VkShaderModuleCreateInfo sinfo = { + .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, + }; + + pl_clock_t start = pl_clock_now(); + switch (params->type) { + case PL_PASS_RASTER: { + sinfo.pCode = (uint32_t *) vert.data; + sinfo.codeSize = vert.size; + VK(vk->CreateShaderModule(vk->dev, &sinfo, PL_VK_ALLOC, &pass_vk->vert)); + PL_VK_NAME(SHADER_MODULE, pass_vk->vert, "vertex"); + + sinfo.pCode = (uint32_t *) frag.data; + sinfo.codeSize = frag.size; + VK(vk->CreateShaderModule(vk->dev, &sinfo, PL_VK_ALLOC, &pass_vk->shader)); + PL_VK_NAME(SHADER_MODULE, pass_vk->shader, "fragment"); + + pass_vk->attrs = pl_calloc_ptr(pass, params->num_vertex_attribs, pass_vk->attrs); + for (int i = 0; i < params->num_vertex_attribs; i++) { + struct pl_vertex_attrib *va = ¶ms->vertex_attribs[i]; + const struct vk_format **pfmt_vk = PL_PRIV(va->fmt); + + pass_vk->attrs[i] = (VkVertexInputAttributeDescription) { + .binding = 0, + .location = va->location, + .offset = va->offset, + .format = PL_DEF((*pfmt_vk)->bfmt, (*pfmt_vk)->tfmt), + }; + } + + VkRenderPassCreateInfo rinfo = { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO, + .attachmentCount = 1, + .pAttachments = &(VkAttachmentDescription) { + .format = (VkFormat) params->target_format->signature, + .samples = VK_SAMPLE_COUNT_1_BIT, + .loadOp = pass->params.load_target + ? 
VK_ATTACHMENT_LOAD_OP_LOAD + : VK_ATTACHMENT_LOAD_OP_DONT_CARE, + .storeOp = VK_ATTACHMENT_STORE_OP_STORE, + .initialLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, + .finalLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, + }, + .subpassCount = 1, + .pSubpasses = &(VkSubpassDescription) { + .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS, + .colorAttachmentCount = 1, + .pColorAttachments = &(VkAttachmentReference) { + .attachment = 0, + .layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, + }, + }, + }; + + VK(vk->CreateRenderPass(vk->dev, &rinfo, PL_VK_ALLOC, &pass_vk->renderPass)); + break; + } + case PL_PASS_COMPUTE: { + sinfo.pCode = (uint32_t *) comp.data; + sinfo.codeSize = comp.size; + VK(vk->CreateShaderModule(vk->dev, &sinfo, PL_VK_ALLOC, &pass_vk->shader)); + PL_VK_NAME(SHADER_MODULE, pass_vk->shader, "compute"); + break; + } + case PL_PASS_INVALID: + case PL_PASS_TYPE_COUNT: + pl_unreachable(); + } + + pl_clock_t after_compilation = pl_clock_now(); + pl_log_cpu_time(gpu->log, start, after_compilation, "compiling shader"); + + // Update cache entries on successful compilation + pl_cache_steal(cache, &vert); + pl_cache_steal(cache, &frag); + pl_cache_steal(cache, &comp); + + // Create the graphics/compute pipeline + VkPipeline *pipe = has_spec ? &pass_vk->base : &pass_vk->pipe; + VK(vk_recreate_pipelines(vk, pass, has_spec, VK_NULL_HANDLE, pipe)); + pl_log_cpu_time(gpu->log, after_compilation, pl_clock_now(), "creating pipeline"); + + // Update pipeline cache + if (cache) { + size_t size = 0; + VK(vk->GetPipelineCacheData(vk->dev, pass_vk->cache, &size, NULL)); + pl_cache_obj_resize(tmp, &pipecache, size); + VK(vk->GetPipelineCacheData(vk->dev, pass_vk->cache, &size, pipecache.data)); + pl_cache_steal(cache, &pipecache); + } + + if (!has_spec) { + // We can free these if we no longer need them for specialization + pl_free_ptr(&pass_vk->attrs); + vk->DestroyShaderModule(vk->dev, pass_vk->vert, PL_VK_ALLOC); + vk->DestroyShaderModule(vk->dev, pass_vk->shader, PL_VK_ALLOC); + vk->DestroyPipelineCache(vk->dev, pass_vk->cache, PL_VK_ALLOC); + pass_vk->vert = VK_NULL_HANDLE; + pass_vk->shader = VK_NULL_HANDLE; + pass_vk->cache = VK_NULL_HANDLE; + } + + PL_DEBUG(vk, "Pass statistics: size %zu, SPIR-V: vert %zu frag %zu comp %zu", + pipecache.size, vert.size, frag.size, comp.size); + + success = true; + +error: + if (!success) { + pass_destroy_cb(gpu, pass); + pass = NULL; + } + +#undef NUM_DS + + pl_free(tmp); + return pass; +} + +static const VkPipelineStageFlags2 shaderStages[] = { + [PL_PASS_RASTER] = VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT, + [PL_PASS_COMPUTE] = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, +}; + +static void vk_update_descriptor(pl_gpu gpu, struct vk_cmd *cmd, pl_pass pass, + struct pl_desc_binding db, + VkDescriptorSet ds, int idx) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct pl_pass_vk *pass_vk = PL_PRIV(pass); + struct pl_desc *desc = &pass->params.descriptors[idx]; + + VkWriteDescriptorSet *wds = &pass_vk->dswrite[idx]; + *wds = (VkWriteDescriptorSet) { + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = ds, + .dstBinding = desc->binding, + .descriptorCount = 1, + .descriptorType = dsType[desc->type], + }; + + static const VkAccessFlags2 storageAccess[PL_DESC_ACCESS_COUNT] = { + [PL_DESC_ACCESS_READONLY] = VK_ACCESS_2_SHADER_STORAGE_READ_BIT, + [PL_DESC_ACCESS_WRITEONLY] = VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, + [PL_DESC_ACCESS_READWRITE] = VK_ACCESS_2_SHADER_STORAGE_READ_BIT | + VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, + }; + + switch (desc->type) { 
+ case PL_DESC_SAMPLED_TEX: { + pl_tex tex = db.object; + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + + vk_tex_barrier(gpu, cmd, tex, shaderStages[pass->params.type], + VK_ACCESS_2_SHADER_SAMPLED_READ_BIT, + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, + VK_QUEUE_FAMILY_IGNORED); + + VkDescriptorImageInfo *iinfo = &pass_vk->dsiinfo[idx]; + *iinfo = (VkDescriptorImageInfo) { + .sampler = p->samplers[db.sample_mode][db.address_mode], + .imageView = tex_vk->view, + .imageLayout = tex_vk->layout, + }; + + wds->pImageInfo = iinfo; + return; + } + case PL_DESC_STORAGE_IMG: { + pl_tex tex = db.object; + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + + vk_tex_barrier(gpu, cmd, tex, shaderStages[pass->params.type], + storageAccess[desc->access], VK_IMAGE_LAYOUT_GENERAL, + VK_QUEUE_FAMILY_IGNORED); + + VkDescriptorImageInfo *iinfo = &pass_vk->dsiinfo[idx]; + *iinfo = (VkDescriptorImageInfo) { + .imageView = tex_vk->view, + .imageLayout = tex_vk->layout, + }; + + wds->pImageInfo = iinfo; + return; + } + case PL_DESC_BUF_UNIFORM: + case PL_DESC_BUF_STORAGE: { + pl_buf buf = db.object; + struct pl_buf_vk *buf_vk = PL_PRIV(buf); + + VkAccessFlags2 access = VK_ACCESS_2_UNIFORM_READ_BIT; + if (desc->type == PL_DESC_BUF_STORAGE) + access = storageAccess[desc->access]; + + vk_buf_barrier(gpu, cmd, buf, shaderStages[pass->params.type], + access, 0, buf->params.size, false); + + VkDescriptorBufferInfo *binfo = &pass_vk->dsbinfo[idx]; + *binfo = (VkDescriptorBufferInfo) { + .buffer = buf_vk->mem.buf, + .offset = buf_vk->mem.offset, + .range = buf->params.size, + }; + + wds->pBufferInfo = binfo; + return; + } + case PL_DESC_BUF_TEXEL_UNIFORM: + case PL_DESC_BUF_TEXEL_STORAGE: { + pl_buf buf = db.object; + struct pl_buf_vk *buf_vk = PL_PRIV(buf); + + VkAccessFlags2 access = VK_ACCESS_2_SHADER_SAMPLED_READ_BIT; + if (desc->type == PL_DESC_BUF_TEXEL_STORAGE) + access = storageAccess[desc->access]; + + vk_buf_barrier(gpu, cmd, buf, shaderStages[pass->params.type], + access, 0, buf->params.size, false); + + wds->pTexelBufferView = &buf_vk->view; + return; + } + case PL_DESC_INVALID: + case PL_DESC_TYPE_COUNT: + break; + } + + pl_unreachable(); +} + +static void vk_release_descriptor(pl_gpu gpu, struct vk_cmd *cmd, pl_pass pass, + struct pl_desc_binding db, int idx) +{ + const struct pl_desc *desc = &pass->params.descriptors[idx]; + + switch (desc->type) { + case PL_DESC_BUF_UNIFORM: + case PL_DESC_BUF_STORAGE: + case PL_DESC_BUF_TEXEL_UNIFORM: + case PL_DESC_BUF_TEXEL_STORAGE: + if (desc->access != PL_DESC_ACCESS_READONLY) { + pl_buf buf = db.object; + vk_buf_flush(gpu, cmd, buf, 0, buf->params.size); + } + return; + case PL_DESC_SAMPLED_TEX: + case PL_DESC_STORAGE_IMG: + return; + case PL_DESC_INVALID: + case PL_DESC_TYPE_COUNT: + break; + } + + pl_unreachable(); +} + +static void set_ds(struct pl_pass_vk *pass_vk, void *dsbit) +{ + pass_vk->dmask |= (uintptr_t) dsbit; +} + +static bool need_respec(pl_pass pass, const struct pl_pass_run_params *params) +{ + struct pl_pass_vk *pass_vk = PL_PRIV(pass); + if (!pass_vk->spec_size || !params->constant_data) + return false; + + VkSpecializationInfo *specInfo = &pass_vk->specInfo; + size_t size = pass_vk->spec_size; + if (!specInfo->pData) { + // Shader was never specialized before + specInfo->pData = pl_memdup((void *) pass, params->constant_data, size); + specInfo->dataSize = size; + return true; + } + + // Shader is being re-specialized with new values + if (memcmp(specInfo->pData, params->constant_data, size) != 0) { + memcpy((void *) specInfo->pData, params->constant_data, 
size); + return true; + } + + return false; +} + +void vk_pass_run(pl_gpu gpu, const struct pl_pass_run_params *params) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + pl_pass pass = params->pass; + struct pl_pass_vk *pass_vk = PL_PRIV(pass); + + if (params->vertex_data || params->index_data) + return pl_pass_run_vbo(gpu, params); + + // Check if we need to re-specialize this pipeline + if (need_respec(pass, params)) { + pl_clock_t start = pl_clock_now(); + VK(vk_recreate_pipelines(vk, pass, false, pass_vk->base, &pass_vk->pipe)); + pl_log_cpu_time(gpu->log, start, pl_clock_now(), "re-specializing shader"); + } + + if (!pass_vk->use_pushd) { + // Wait for a free descriptor set + while (!pass_vk->dmask) { + PL_TRACE(gpu, "No free descriptor sets! ...blocking (slow path)"); + vk_poll_commands(vk, 10000000); // 10 ms + } + } + + static const enum queue_type types[] = { + [PL_PASS_RASTER] = GRAPHICS, + [PL_PASS_COMPUTE] = COMPUTE, + }; + + struct vk_cmd *cmd = CMD_BEGIN_TIMED(types[pass->params.type], params->timer); + if (!cmd) + goto error; + + // Find a descriptor set to use + VkDescriptorSet ds = VK_NULL_HANDLE; + if (!pass_vk->use_pushd) { + for (int i = 0; i < PL_ARRAY_SIZE(pass_vk->dss); i++) { + uint16_t dsbit = 1u << i; + if (pass_vk->dmask & dsbit) { + ds = pass_vk->dss[i]; + pass_vk->dmask &= ~dsbit; // unset + vk_cmd_callback(cmd, (vk_cb) set_ds, pass_vk, + (void *)(uintptr_t) dsbit); + break; + } + } + } + + // Update the dswrite structure with all of the new values + for (int i = 0; i < pass->params.num_descriptors; i++) + vk_update_descriptor(gpu, cmd, pass, params->desc_bindings[i], ds, i); + + if (!pass_vk->use_pushd) { + vk->UpdateDescriptorSets(vk->dev, pass->params.num_descriptors, + pass_vk->dswrite, 0, NULL); + } + + // Bind the pipeline, descriptor set, etc. + static const VkPipelineBindPoint bindPoint[] = { + [PL_PASS_RASTER] = VK_PIPELINE_BIND_POINT_GRAPHICS, + [PL_PASS_COMPUTE] = VK_PIPELINE_BIND_POINT_COMPUTE, + }; + + vk->CmdBindPipeline(cmd->buf, bindPoint[pass->params.type], + PL_DEF(pass_vk->pipe, pass_vk->base)); + + if (ds) { + vk->CmdBindDescriptorSets(cmd->buf, bindPoint[pass->params.type], + pass_vk->pipeLayout, 0, 1, &ds, 0, NULL); + } + + if (pass_vk->use_pushd) { + vk->CmdPushDescriptorSetKHR(cmd->buf, bindPoint[pass->params.type], + pass_vk->pipeLayout, 0, + pass->params.num_descriptors, + pass_vk->dswrite); + } + + if (pass->params.push_constants_size) { + vk->CmdPushConstants(cmd->buf, pass_vk->pipeLayout, + stageFlags[pass->params.type], 0, + pass->params.push_constants_size, + params->push_constants); + } + + switch (pass->params.type) { + case PL_PASS_RASTER: { + pl_tex tex = params->target; + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + pl_buf vert = params->vertex_buf; + struct pl_buf_vk *vert_vk = PL_PRIV(vert); + pl_buf index = params->index_buf; + struct pl_buf_vk *index_vk = index ? 
PL_PRIV(index) : NULL; + pl_assert(vert); + + // In the edge case that vert = index buffer, we need to synchronize + // for both flags simultaneously + VkPipelineStageFlags2 vbo_stage = VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT; + VkAccessFlags2 vbo_flags = VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT; + if (index == vert) { + vbo_stage |= VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT; + vbo_flags |= VK_ACCESS_2_INDEX_READ_BIT; + } + + vk_buf_barrier(gpu, cmd, vert, vbo_stage, vbo_flags, 0, vert->params.size, false); + + VkDeviceSize offset = vert_vk->mem.offset + params->buf_offset; + vk->CmdBindVertexBuffers(cmd->buf, 0, 1, &vert_vk->mem.buf, &offset); + + if (index) { + if (index != vert) { + vk_buf_barrier(gpu, cmd, index, VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT, + VK_ACCESS_2_INDEX_READ_BIT, 0, index->params.size, + false); + } + + static const VkIndexType index_fmts[PL_INDEX_FORMAT_COUNT] = { + [PL_INDEX_UINT16] = VK_INDEX_TYPE_UINT16, + [PL_INDEX_UINT32] = VK_INDEX_TYPE_UINT32, + }; + + vk->CmdBindIndexBuffer(cmd->buf, index_vk->mem.buf, + index_vk->mem.offset + params->index_offset, + index_fmts[params->index_fmt]); + } + + + VkAccessFlags2 fbo_access = VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT; + if (pass->params.load_target) + fbo_access |= VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT; + + vk_tex_barrier(gpu, cmd, tex, VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, + fbo_access, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, + VK_QUEUE_FAMILY_IGNORED); + + VkViewport viewport = { + .x = params->viewport.x0, + .y = params->viewport.y0, + .width = pl_rect_w(params->viewport), + .height = pl_rect_h(params->viewport), + }; + + VkRect2D scissor = { + .offset = {params->scissors.x0, params->scissors.y0}, + .extent = {pl_rect_w(params->scissors), pl_rect_h(params->scissors)}, + }; + + vk->CmdSetViewport(cmd->buf, 0, 1, &viewport); + vk->CmdSetScissor(cmd->buf, 0, 1, &scissor); + + VkRenderPassBeginInfo binfo = { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO, + .renderPass = pass_vk->renderPass, + .framebuffer = tex_vk->framebuffer, + .renderArea.extent = {tex->params.w, tex->params.h}, + }; + + vk->CmdBeginRenderPass(cmd->buf, &binfo, VK_SUBPASS_CONTENTS_INLINE); + + if (index) { + vk->CmdDrawIndexed(cmd->buf, params->vertex_count, 1, 0, 0, 0); + } else { + vk->CmdDraw(cmd->buf, params->vertex_count, 1, 0, 0); + } + + vk->CmdEndRenderPass(cmd->buf); + break; + } + case PL_PASS_COMPUTE: + vk->CmdDispatch(cmd->buf, params->compute_groups[0], + params->compute_groups[1], + params->compute_groups[2]); + break; + case PL_PASS_INVALID: + case PL_PASS_TYPE_COUNT: + pl_unreachable(); + }; + + for (int i = 0; i < pass->params.num_descriptors; i++) + vk_release_descriptor(gpu, cmd, pass, params->desc_bindings[i], i); + + // submit this command buffer for better intra-frame granularity + CMD_SUBMIT(&cmd); + +error: + return; +} diff --git a/src/vulkan/gpu_tex.c b/src/vulkan/gpu_tex.c new file mode 100644 index 0000000..7ab83b7 --- /dev/null +++ b/src/vulkan/gpu_tex.c @@ -0,0 +1,1453 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "gpu.h" + +void vk_tex_barrier(pl_gpu gpu, struct vk_cmd *cmd, pl_tex tex, + VkPipelineStageFlags2 stage, VkAccessFlags2 access, + VkImageLayout layout, uint32_t qf) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + pl_rc_ref(&tex_vk->rc); + pl_assert(!tex_vk->held); + pl_assert(!tex_vk->num_planes); + + // CONCURRENT images require transitioning to/from IGNORED, EXCLUSIVE + // images require transitioning to/from the concrete QF index + if (vk->pools.num == 1) { + if (tex_vk->qf == VK_QUEUE_FAMILY_IGNORED) + tex_vk->qf = cmd->pool->qf; + if (qf == VK_QUEUE_FAMILY_IGNORED) + qf = cmd->pool->qf; + } + + struct vk_sync_scope last; + bool is_trans = layout != tex_vk->layout, is_xfer = qf != tex_vk->qf; + last = vk_sem_barrier(cmd, &tex_vk->sem, stage, access, is_trans || is_xfer); + + VkImageMemoryBarrier2 barr = { + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2, + .srcStageMask = last.stage, + .srcAccessMask = last.access, + .dstStageMask = stage, + .dstAccessMask = access, + .oldLayout = tex_vk->layout, + .newLayout = layout, + .srcQueueFamilyIndex = tex_vk->qf, + .dstQueueFamilyIndex = qf, + .image = tex_vk->img, + .subresourceRange = { + .aspectMask = tex_vk->aspect, + .levelCount = 1, + .layerCount = 1, + }, + }; + + if (tex_vk->may_invalidate) { + tex_vk->may_invalidate = false; + barr.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; + } + + if (last.access || is_trans || is_xfer) { + vk_cmd_barrier(cmd, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .imageMemoryBarrierCount = 1, + .pImageMemoryBarriers = &barr, + }); + } + + tex_vk->qf = qf; + tex_vk->layout = layout; + vk_cmd_callback(cmd, (vk_cb) vk_tex_deref, gpu, tex); + + for (int i = 0; i < tex_vk->ext_deps.num; i++) + vk_cmd_dep(cmd, stage, tex_vk->ext_deps.elem[i]); + tex_vk->ext_deps.num = 0; + + if (tex_vk->ext_sync) { + vk_cmd_callback(cmd, (vk_cb) vk_sync_deref, gpu, tex_vk->ext_sync); + tex_vk->ext_sync = NULL; + } +} + +static void vk_tex_destroy(pl_gpu gpu, struct pl_tex_t *tex) +{ + if (!tex) + return; + + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + + vk_sync_deref(gpu, tex_vk->ext_sync); + vk->DestroyFramebuffer(vk->dev, tex_vk->framebuffer, PL_VK_ALLOC); + vk->DestroyImageView(vk->dev, tex_vk->view, PL_VK_ALLOC); + for (int i = 0; i < tex_vk->num_planes; i++) + vk_tex_deref(gpu, tex->planes[i]); + if (!tex_vk->external_img) { + vk->DestroyImage(vk->dev, tex_vk->img, PL_VK_ALLOC); + vk_malloc_free(vk->ma, &tex_vk->mem); + } + + pl_free(tex); +} + +void vk_tex_deref(pl_gpu gpu, pl_tex tex) +{ + if (!tex) + return; + + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + if (pl_rc_deref(&tex_vk->rc)) + vk_tex_destroy(gpu, (struct pl_tex_t *) tex); +} + + +// Initializes non-VkImage values like the image view, framebuffers, etc. 
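+// For planar parent images this only names the image and initializes its
+// reference count; the per-plane wrapper textures are set up individually.
+// For ordinary images it also picks the transfer queue (TRANSFER when a
+// dedicated transfer pool exists and host access is requested, COMPUTE for
+// emulated formats), creates the VkImageView needed for sampled, rendered
+// or storage use, and, for renderable textures, creates a VkFramebuffer
+// against a temporary "dummy" VkRenderPass, since framebuffers can only be
+// created against a compatible render pass layout.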
+static bool vk_init_image(pl_gpu gpu, pl_tex tex, pl_debug_tag debug_tag) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + + const struct pl_tex_params *params = &tex->params; + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + pl_assert(tex_vk->img); + PL_VK_NAME(IMAGE, tex_vk->img, debug_tag); + pl_rc_init(&tex_vk->rc); + if (tex_vk->num_planes) + return true; + tex_vk->layout = VK_IMAGE_LAYOUT_UNDEFINED; + tex_vk->transfer_queue = GRAPHICS; + tex_vk->qf = VK_QUEUE_FAMILY_IGNORED; // will be set on first use, if needed + + // Always use the transfer pool if available, for efficiency + if ((params->host_writable || params->host_readable) && vk->pool_transfer) + tex_vk->transfer_queue = TRANSFER; + + // For emulated formats: force usage of the compute queue, because we + // can't properly track cross-queue dependencies for buffers (yet?) + if (params->format->emulated) + tex_vk->transfer_queue = COMPUTE; + + bool ret = false; + VkRenderPass dummyPass = VK_NULL_HANDLE; + + if (params->sampleable || params->renderable || params->storable) { + static const VkImageViewType viewType[] = { + [VK_IMAGE_TYPE_1D] = VK_IMAGE_VIEW_TYPE_1D, + [VK_IMAGE_TYPE_2D] = VK_IMAGE_VIEW_TYPE_2D, + [VK_IMAGE_TYPE_3D] = VK_IMAGE_VIEW_TYPE_3D, + }; + + const VkImageViewCreateInfo vinfo = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .image = tex_vk->img, + .viewType = viewType[tex_vk->type], + .format = tex_vk->img_fmt, + .subresourceRange = { + .aspectMask = tex_vk->aspect, + .levelCount = 1, + .layerCount = 1, + }, + }; + + VK(vk->CreateImageView(vk->dev, &vinfo, PL_VK_ALLOC, &tex_vk->view)); + PL_VK_NAME(IMAGE_VIEW, tex_vk->view, debug_tag); + } + + if (params->renderable) { + // Framebuffers need to be created against a specific render pass + // layout, so we need to temporarily create a skeleton/dummy render + // pass for vulkan to figure out the compatibility + VkRenderPassCreateInfo rinfo = { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO, + .attachmentCount = 1, + .pAttachments = &(VkAttachmentDescription) { + .format = tex_vk->img_fmt, + .samples = VK_SAMPLE_COUNT_1_BIT, + .loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE, + .storeOp = VK_ATTACHMENT_STORE_OP_STORE, + .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, + .finalLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, + }, + .subpassCount = 1, + .pSubpasses = &(VkSubpassDescription) { + .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS, + .colorAttachmentCount = 1, + .pColorAttachments = &(VkAttachmentReference) { + .attachment = 0, + .layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, + }, + }, + }; + + VK(vk->CreateRenderPass(vk->dev, &rinfo, PL_VK_ALLOC, &dummyPass)); + + VkFramebufferCreateInfo finfo = { + .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO, + .renderPass = dummyPass, + .attachmentCount = 1, + .pAttachments = &tex_vk->view, + .width = tex->params.w, + .height = tex->params.h, + .layers = 1, + }; + + if (finfo.width > vk->props.limits.maxFramebufferWidth || + finfo.height > vk->props.limits.maxFramebufferHeight) + { + PL_ERR(gpu, "Framebuffer of size %dx%d exceeds the maximum allowed " + "dimensions: %dx%d", finfo.width, finfo.height, + vk->props.limits.maxFramebufferWidth, + vk->props.limits.maxFramebufferHeight); + goto error; + } + + VK(vk->CreateFramebuffer(vk->dev, &finfo, PL_VK_ALLOC, + &tex_vk->framebuffer)); + PL_VK_NAME(FRAMEBUFFER, tex_vk->framebuffer, debug_tag); + } + + ret = true; + +error: + vk->DestroyRenderPass(vk->dev, dummyPass, PL_VK_ALLOC); + return ret; +} + +pl_tex vk_tex_create(pl_gpu gpu, 
const struct pl_tex_params *params) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + + enum pl_handle_type handle_type = params->export_handle | + params->import_handle; + VkExternalMemoryHandleTypeFlagBitsKHR vk_handle_type = vk_mem_handle_type(handle_type); + + struct pl_tex_t *tex = pl_zalloc_obj(NULL, tex, struct pl_tex_vk); + pl_fmt fmt = params->format; + tex->params = *params; + tex->params.initial_data = NULL; + tex->sampler_type = PL_SAMPLER_NORMAL; + + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + struct pl_fmt_vk *fmtp = PL_PRIV(fmt); + tex_vk->img_fmt = fmtp->vk_fmt->tfmt; + tex_vk->num_planes = fmt->num_planes; + for (int i = 0; i < tex_vk->num_planes; i++) + tex_vk->aspect |= VK_IMAGE_ASPECT_PLANE_0_BIT << i; + tex_vk->aspect = PL_DEF(tex_vk->aspect, VK_IMAGE_ASPECT_COLOR_BIT); + + switch (pl_tex_params_dimension(*params)) { + case 1: tex_vk->type = VK_IMAGE_TYPE_1D; break; + case 2: tex_vk->type = VK_IMAGE_TYPE_2D; break; + case 3: tex_vk->type = VK_IMAGE_TYPE_3D; break; + } + + if (fmt->emulated) { + tex_vk->texel_fmt = pl_find_fmt(gpu, fmt->type, 1, 0, + fmt->host_bits[0], + PL_FMT_CAP_TEXEL_UNIFORM); + if (!tex_vk->texel_fmt) { + PL_ERR(gpu, "Failed picking texel format for emulated texture!"); + goto error; + } + + // Our format emulation requires storage image support. In order to + // make a bunch of checks happy, just mark it off as storable (and also + // enable VK_IMAGE_USAGE_STORAGE_BIT, which we do below) + tex->params.storable = true; + } + + if (fmtp->blit_emulated) { + // Enable what's required for sampling + tex->params.sampleable = fmt->caps & PL_FMT_CAP_SAMPLEABLE; + tex->params.storable = true; + } + + // Blit emulation on planar textures requires storage + if ((params->blit_src || params->blit_dst) && tex_vk->num_planes) + tex->params.storable = true; + + VkImageUsageFlags usage = 0; + VkImageCreateFlags flags = 0; + if (tex->params.sampleable) + usage |= VK_IMAGE_USAGE_SAMPLED_BIT; + if (tex->params.renderable) + usage |= VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; + if (tex->params.storable) + usage |= VK_IMAGE_USAGE_STORAGE_BIT; + if (tex->params.host_readable || tex->params.blit_src) + usage |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT; + if (tex->params.host_writable || tex->params.blit_dst || params->initial_data) + usage |= VK_IMAGE_USAGE_TRANSFER_DST_BIT; + + if (!usage) { + // Vulkan requires images have at least *some* image usage set, but our + // API is perfectly happy with a (useless) image. So just put + // VK_IMAGE_USAGE_TRANSFER_DST_BIT since this harmless. + usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT; + } + + if (tex_vk->num_planes) { + flags |= VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT | + VK_IMAGE_CREATE_EXTENDED_USAGE_BIT; + } + + // FIXME: Since we can't keep track of queue family ownership properly, + // and we don't know in advance what types of queue families this image + // will belong to, we're forced to share all of our images between all + // command pools. + uint32_t qfs[3] = {0}; + pl_assert(vk->pools.num <= PL_ARRAY_SIZE(qfs)); + for (int i = 0; i < vk->pools.num; i++) + qfs[i] = vk->pools.elem[i]->qf; + + VkImageDrmFormatModifierExplicitCreateInfoEXT drm_explicit = { + .sType = VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_EXPLICIT_CREATE_INFO_EXT, + .drmFormatModifier = params->shared_mem.drm_format_mod, + .drmFormatModifierPlaneCount = 1, + .pPlaneLayouts = &(VkSubresourceLayout) { + .rowPitch = PL_DEF(params->shared_mem.stride_w, params->w), + .depthPitch = params->d ? 
PL_DEF(params->shared_mem.stride_h, params->h) : 0, + .offset = params->shared_mem.offset, + }, + }; + +#ifdef VK_EXT_metal_objects + VkImportMetalTextureInfoEXT import_metal_tex = { + .sType = VK_STRUCTURE_TYPE_IMPORT_METAL_TEXTURE_INFO_EXT, + .plane = VK_IMAGE_ASPECT_PLANE_0_BIT << params->shared_mem.plane, + }; + + VkImportMetalIOSurfaceInfoEXT import_iosurface = { + .sType = VK_STRUCTURE_TYPE_IMPORT_METAL_IO_SURFACE_INFO_EXT, + }; +#endif + + VkImageDrmFormatModifierListCreateInfoEXT drm_list = { + .sType = VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_LIST_CREATE_INFO_EXT, + .drmFormatModifierCount = fmt->num_modifiers, + .pDrmFormatModifiers = fmt->modifiers, + }; + + VkExternalMemoryImageCreateInfoKHR ext_info = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO_KHR, + .handleTypes = vk_handle_type, + }; + + VkImageCreateInfo iinfo = { + .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, + .pNext = vk_handle_type ? &ext_info : NULL, + .imageType = tex_vk->type, + .format = tex_vk->img_fmt, + .extent = (VkExtent3D) { + .width = params->w, + .height = PL_MAX(1, params->h), + .depth = PL_MAX(1, params->d) + }, + .mipLevels = 1, + .arrayLayers = 1, + .samples = VK_SAMPLE_COUNT_1_BIT, + .tiling = VK_IMAGE_TILING_OPTIMAL, + .usage = usage, + .flags = flags, + .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, + .sharingMode = vk->pools.num > 1 ? VK_SHARING_MODE_CONCURRENT + : VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = vk->pools.num, + .pQueueFamilyIndices = qfs, + }; + + struct vk_malloc_params mparams = { + .optimal = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + .export_handle = params->export_handle, + .import_handle = params->import_handle, + .shared_mem = params->shared_mem, + .debug_tag = params->debug_tag, + }; + + if (params->import_handle == PL_HANDLE_DMA_BUF) { + vk_link_struct(&iinfo, &drm_explicit); + iinfo.tiling = VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT; + mparams.shared_mem.offset = 0x0; // handled via plane offsets + } + +#ifdef VK_EXT_metal_objects + if (params->import_handle == PL_HANDLE_MTL_TEX) { + vk_link_struct(&iinfo, &import_metal_tex); + import_metal_tex.mtlTexture = params->shared_mem.handle.handle; + } + + if (params->import_handle == PL_HANDLE_IOSURFACE) { + vk_link_struct(&iinfo, &import_iosurface); + import_iosurface.ioSurface = params->shared_mem.handle.handle; + } +#endif + + if (params->export_handle == PL_HANDLE_DMA_BUF) { + pl_assert(drm_list.drmFormatModifierCount > 0); + vk_link_struct(&iinfo, &drm_list); + iinfo.tiling = VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT; + } + + // Double-check physical image format limits and fail if invalid + VkPhysicalDeviceImageDrmFormatModifierInfoEXT drm_pinfo = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_DRM_FORMAT_MODIFIER_INFO_EXT, + .sharingMode = iinfo.sharingMode, + .queueFamilyIndexCount = iinfo.queueFamilyIndexCount, + .pQueueFamilyIndices = iinfo.pQueueFamilyIndices, + }; + + VkPhysicalDeviceExternalImageFormatInfoKHR ext_pinfo = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO_KHR, + .handleType = ext_info.handleTypes, + }; + + if (handle_type == PL_HANDLE_DMA_BUF) { + if (params->import_handle) { + // On import, we know exactly which format modifier to test + drm_pinfo.drmFormatModifier = drm_explicit.drmFormatModifier; + } else { + // On export, the choice of format modifier is ambiguous, because + // we offer the implementation a whole list to choose from. 
In + // principle, we must check *all* supported drm format modifiers, + // but in practice it should hopefully suffice to just check one + drm_pinfo.drmFormatModifier = drm_list.pDrmFormatModifiers[0]; + } + vk_link_struct(&ext_pinfo, &drm_pinfo); + } + + VkPhysicalDeviceImageFormatInfo2KHR pinfo = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2_KHR, + .pNext = vk_handle_type ? &ext_pinfo : NULL, + .format = iinfo.format, + .type = iinfo.imageType, + .tiling = iinfo.tiling, + .usage = iinfo.usage, + .flags = iinfo.flags, + }; + + VkExternalImageFormatPropertiesKHR ext_props = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES_KHR, + }; + + VkImageFormatProperties2KHR props = { + .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2_KHR, + .pNext = vk_handle_type ? &ext_props : NULL, + }; + + VkResult res; + res = vk->GetPhysicalDeviceImageFormatProperties2KHR(vk->physd, &pinfo, &props); + if (res == VK_ERROR_FORMAT_NOT_SUPPORTED) { + PL_DEBUG(gpu, "Texture creation failed: not supported"); + goto error; + } else { + PL_VK_ASSERT(res, "Querying image format properties"); + } + + VkExtent3D max = props.imageFormatProperties.maxExtent; + if (params->w > max.width || params->h > max.height || params->d > max.depth) + { + PL_ERR(gpu, "Requested image size %dx%dx%d exceeds the maximum allowed " + "dimensions %dx%dx%d for vulkan image format %x", + params->w, params->h, params->d, max.width, max.height, max.depth, + (unsigned) iinfo.format); + goto error; + } + + // Ensure the handle type is supported + if (vk_handle_type) { + bool ok = vk_external_mem_check(vk, &ext_props.externalMemoryProperties, + handle_type, params->import_handle); + if (!ok) { + PL_ERR(gpu, "Requested handle type is not compatible with the " + "specified combination of image parameters. Possibly the " + "handle type is unsupported altogether?"); + goto error; + } + } + + VK(vk->CreateImage(vk->dev, &iinfo, PL_VK_ALLOC, &tex_vk->img)); + tex_vk->usage_flags = iinfo.usage; + + VkMemoryDedicatedRequirements ded_reqs = { + .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS_KHR, + }; + + VkMemoryRequirements2 reqs = { + .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2_KHR, + .pNext = &ded_reqs, + }; + + VkImageMemoryRequirementsInfo2 req_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2_KHR, + .image = tex_vk->img, + }; + + vk->GetImageMemoryRequirements2(vk->dev, &req_info, &reqs); + mparams.reqs = reqs.memoryRequirements; + if (ded_reqs.prefersDedicatedAllocation) { + mparams.ded_image = tex_vk->img; + if (vk_mem_handle_type(params->import_handle)) + mparams.shared_mem.size = reqs.memoryRequirements.size; + } + + const char *debug_tag = params->debug_tag ? params->debug_tag : + params->import_handle ? 
"imported" : "created"; + + if (!params->import_handle || vk_mem_handle_type(params->import_handle)) { + struct vk_memslice *mem = &tex_vk->mem; + if (!vk_malloc_slice(vk->ma, mem, &mparams)) + goto error; + + VK(vk->BindImageMemory(vk->dev, tex_vk->img, mem->vkmem, mem->offset)); + } + + static const char * const plane_names[4] = { + "plane 0", "plane 1", "plane 2", "plane 3", + }; + + if (tex_vk->num_planes) { + for (int i = 0; i < tex_vk->num_planes; i++) { + struct pl_tex_t *plane; + + pl_assert(tex_vk->type == VK_IMAGE_TYPE_2D); + plane = (struct pl_tex_t *) pl_vulkan_wrap(gpu, pl_vulkan_wrap_params( + .image = tex_vk->img, + .aspect = VK_IMAGE_ASPECT_PLANE_0_BIT << i, + .width = PL_RSHIFT_UP(tex->params.w, fmt->planes[i].shift_x), + .height = PL_RSHIFT_UP(tex->params.h, fmt->planes[i].shift_y), + .format = fmtp->vk_fmt->pfmt[i].fmt, + .usage = usage, + .user_data = params->user_data, + .debug_tag = PL_DEF(params->debug_tag, plane_names[i]), + )); + if (!plane) + goto error; + plane->parent = tex; + tex->planes[i] = plane; + tex_vk->planes[i] = PL_PRIV(plane); + tex_vk->planes[i]->held = false; + tex_vk->planes[i]->layout = tex_vk->layout; + } + + // Explicitly mask out all usage flags from planar parent images + pl_assert(!fmt->caps); + tex->params.sampleable = false; + tex->params.renderable = false; + tex->params.storable = false; + tex->params.blit_src = false; + tex->params.blit_dst = false; + tex->params.host_writable = false; + tex->params.host_readable = false; + } + + if (!vk_init_image(gpu, tex, debug_tag)) + goto error; + + if (params->export_handle) + tex->shared_mem = tex_vk->mem.shared_mem; + + if (params->export_handle == PL_HANDLE_DMA_BUF) { + if (vk->GetImageDrmFormatModifierPropertiesEXT) { + + // Query the DRM format modifier and plane layout from the driver + VkImageDrmFormatModifierPropertiesEXT mod_props = { + .sType = VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_PROPERTIES_EXT, + }; + + VK(vk->GetImageDrmFormatModifierPropertiesEXT(vk->dev, tex_vk->img, &mod_props)); + tex->shared_mem.drm_format_mod = mod_props.drmFormatModifier; + + VkSubresourceLayout layout = {0}; + VkImageSubresource plane = { + .aspectMask = VK_IMAGE_ASPECT_MEMORY_PLANE_0_BIT_EXT, + }; + + vk->GetImageSubresourceLayout(vk->dev, tex_vk->img, &plane, &layout); + if (layout.offset != 0) { + PL_ERR(gpu, "Exported DRM plane 0 has nonzero offset %zu, " + "this should never happen! Erroring for safety...", + (size_t) layout.offset); + goto error; + } + tex->shared_mem.stride_w = layout.rowPitch; + tex->shared_mem.stride_h = layout.depthPitch; + + } else { + + // Fallback for no modifiers, just do something stupid. 
+ tex->shared_mem.drm_format_mod = DRM_FORMAT_MOD_INVALID; + tex->shared_mem.stride_w = params->w; + tex->shared_mem.stride_h = params->h; + + } + } + + if (params->initial_data) { + struct pl_tex_transfer_params ul_params = { + .tex = tex, + .ptr = (void *) params->initial_data, + .rc = { 0, 0, 0, params->w, params->h, params->d }, + }; + + // Since we re-use GPU helpers which require writable images, just fake it + bool writable = tex->params.host_writable; + tex->params.host_writable = true; + if (!pl_tex_upload(gpu, &ul_params)) + goto error; + tex->params.host_writable = writable; + } + + return tex; + +error: + vk_tex_destroy(gpu, tex); + return NULL; +} + +void vk_tex_invalidate(pl_gpu gpu, pl_tex tex) +{ + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + tex_vk->may_invalidate = true; + for (int i = 0; i < tex_vk->num_planes; i++) + tex_vk->planes[i]->may_invalidate = true; +} + +static bool tex_clear_fallback(pl_gpu gpu, pl_tex tex, + const union pl_clear_color color) +{ + pl_tex pixel = pl_tex_create(gpu, pl_tex_params( + .w = 1, + .h = 1, + .format = tex->params.format, + .storable = true, + .blit_src = true, + .blit_dst = true, + )); + if (!pixel) + return false; + + pl_tex_clear_ex(gpu, pixel, color); + + pl_assert(tex->params.storable); + pl_tex_blit(gpu, pl_tex_blit_params( + .src = pixel, + .dst = tex, + .sample_mode = PL_TEX_SAMPLE_NEAREST, + )); + + pl_tex_destroy(gpu, &pixel); + return true; +} + +void vk_tex_clear_ex(pl_gpu gpu, pl_tex tex, const union pl_clear_color color) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + + if (tex_vk->aspect != VK_IMAGE_ASPECT_COLOR_BIT) { + if (!tex_clear_fallback(gpu, tex, color)) { + PL_ERR(gpu, "Failed clearing imported planar image: color aspect " + "clears disallowed by spec and no shader fallback " + "available"); + } + return; + } + + struct vk_cmd *cmd = CMD_BEGIN(GRAPHICS); + if (!cmd) + return; + + vk_tex_barrier(gpu, cmd, tex, VK_PIPELINE_STAGE_2_CLEAR_BIT, + VK_ACCESS_2_TRANSFER_WRITE_BIT, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + VK_QUEUE_FAMILY_IGNORED); + + pl_static_assert(sizeof(VkClearColorValue) == sizeof(union pl_clear_color)); + const VkClearColorValue *clearColor = (const VkClearColorValue *) &color; + + pl_assert(tex_vk->aspect == VK_IMAGE_ASPECT_COLOR_BIT); + static const VkImageSubresourceRange range = { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .levelCount = 1, + .layerCount = 1, + }; + + vk->CmdClearColorImage(cmd->buf, tex_vk->img, tex_vk->layout, + clearColor, 1, &range); + + CMD_FINISH(&cmd); +} + +void vk_tex_blit(pl_gpu gpu, const struct pl_tex_blit_params *params) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + struct pl_tex_vk *src_vk = PL_PRIV(params->src); + struct pl_tex_vk *dst_vk = PL_PRIV(params->dst); + struct pl_fmt_vk *src_fmtp = PL_PRIV(params->src->params.format); + struct pl_fmt_vk *dst_fmtp = PL_PRIV(params->dst->params.format); + bool blit_emulated = src_fmtp->blit_emulated || dst_fmtp->blit_emulated; + bool planar_fallback = src_vk->aspect != VK_IMAGE_ASPECT_COLOR_BIT || + dst_vk->aspect != VK_IMAGE_ASPECT_COLOR_BIT; + + pl_rect3d src_rc = params->src_rc, dst_rc = params->dst_rc; + bool requires_scaling = !pl_rect3d_eq(src_rc, dst_rc); + if ((requires_scaling && blit_emulated) || planar_fallback) { + if (!pl_tex_blit_compute(gpu, params)) + PL_ERR(gpu, "Failed emulating texture blit, incompatible textures?"); + return; + } + + struct vk_cmd *cmd = CMD_BEGIN(GRAPHICS); + if (!cmd) + return; + + // When the 
blit operation doesn't require scaling, we can use the more + // efficient vkCmdCopyImage instead of vkCmdBlitImage + if (!requires_scaling) { + vk_tex_barrier(gpu, cmd, params->src, VK_PIPELINE_STAGE_2_COPY_BIT, + VK_ACCESS_2_TRANSFER_READ_BIT, + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, + VK_QUEUE_FAMILY_IGNORED); + + vk_tex_barrier(gpu, cmd, params->dst, VK_PIPELINE_STAGE_2_COPY_BIT, + VK_ACCESS_2_TRANSFER_WRITE_BIT, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + VK_QUEUE_FAMILY_IGNORED); + + pl_rect3d_normalize(&src_rc); + + VkImageCopy region = { + .srcSubresource = { + .aspectMask = src_vk->aspect, + .layerCount = 1, + }, + .dstSubresource = { + .aspectMask = dst_vk->aspect, + .layerCount = 1, + }, + .srcOffset = {src_rc.x0, src_rc.y0, src_rc.z0}, + .dstOffset = {src_rc.x0, src_rc.y0, src_rc.z0}, + .extent = { + pl_rect_w(src_rc), + pl_rect_h(src_rc), + pl_rect_d(src_rc), + }, + }; + + vk->CmdCopyImage(cmd->buf, src_vk->img, src_vk->layout, + dst_vk->img, dst_vk->layout, 1, ®ion); + } else { + vk_tex_barrier(gpu, cmd, params->src, VK_PIPELINE_STAGE_2_BLIT_BIT, + VK_ACCESS_2_TRANSFER_READ_BIT, + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, + VK_QUEUE_FAMILY_IGNORED); + + vk_tex_barrier(gpu, cmd, params->dst, VK_PIPELINE_STAGE_2_BLIT_BIT, + VK_ACCESS_2_TRANSFER_WRITE_BIT, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + VK_QUEUE_FAMILY_IGNORED); + + VkImageBlit region = { + .srcSubresource = { + .aspectMask = src_vk->aspect, + .layerCount = 1, + }, + .dstSubresource = { + .aspectMask = dst_vk->aspect, + .layerCount = 1, + }, + .srcOffsets = {{src_rc.x0, src_rc.y0, src_rc.z0}, + {src_rc.x1, src_rc.y1, src_rc.z1}}, + .dstOffsets = {{dst_rc.x0, dst_rc.y0, dst_rc.z0}, + {dst_rc.x1, dst_rc.y1, dst_rc.z1}}, + }; + + static const VkFilter filters[PL_TEX_SAMPLE_MODE_COUNT] = { + [PL_TEX_SAMPLE_NEAREST] = VK_FILTER_NEAREST, + [PL_TEX_SAMPLE_LINEAR] = VK_FILTER_LINEAR, + }; + + vk->CmdBlitImage(cmd->buf, src_vk->img, src_vk->layout, + dst_vk->img, dst_vk->layout, 1, ®ion, + filters[params->sample_mode]); + } + + CMD_FINISH(&cmd); +} + +// Determine the best queue type to perform a buffer<->image copy on +static enum queue_type vk_img_copy_queue(pl_gpu gpu, pl_tex tex, + const struct VkBufferImageCopy *region) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + + const struct pl_tex_vk *tex_vk = PL_PRIV(tex); + enum queue_type queue = tex_vk->transfer_queue; + if (queue != TRANSFER) + return queue; + + VkExtent3D alignment = vk->pool_transfer->props.minImageTransferGranularity; + + enum queue_type fallback = GRAPHICS; + if (gpu->limits.compute_queues > gpu->limits.fragment_queues) + fallback = COMPUTE; // prefer async compute queue + + int tex_w = PL_DEF(tex->params.w, 1), + tex_h = PL_DEF(tex->params.h, 1), + tex_d = PL_DEF(tex->params.d, 1); + + bool full_w = region->imageOffset.x + region->imageExtent.width == tex_w, + full_h = region->imageOffset.y + region->imageExtent.height == tex_h, + full_d = region->imageOffset.z + region->imageExtent.depth == tex_d; + + if (alignment.width) { + + bool unaligned = false; + unaligned |= region->imageOffset.x % alignment.width; + unaligned |= region->imageOffset.y % alignment.height; + unaligned |= region->imageOffset.z % alignment.depth; + unaligned |= (region->imageExtent.width % alignment.width) && !full_w; + unaligned |= (region->imageExtent.height % alignment.height) && !full_h; + unaligned |= (region->imageExtent.depth % alignment.depth) && !full_d; + + return unaligned ? 
fallback : queue; + + } else { + + // an alignment of {0} means the copy must span the entire image + bool unaligned = false; + unaligned |= region->imageOffset.x || !full_w; + unaligned |= region->imageOffset.y || !full_h; + unaligned |= region->imageOffset.z || !full_d; + + return unaligned ? fallback : queue; + + } +} + +static void tex_xfer_cb(void *ctx, void *arg) +{ + void (*fun)(void *priv) = ctx; + fun(arg); +} + +bool vk_tex_upload(pl_gpu gpu, const struct pl_tex_transfer_params *params) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + pl_tex tex = params->tex; + pl_fmt fmt = tex->params.format; + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + struct pl_tex_transfer_params *slices = NULL; + int num_slices = 0; + + if (!params->buf) + return pl_tex_upload_pbo(gpu, params); + + pl_buf buf = params->buf; + struct pl_buf_vk *buf_vk = PL_PRIV(buf); + pl_rect3d rc = params->rc; + const size_t size = pl_tex_transfer_size(params); + const size_t buf_offset = buf_vk->mem.offset + params->buf_offset; + bool unaligned = buf_offset % fmt->texel_size; + if (unaligned) + PL_TRACE(gpu, "vk_tex_upload: unaligned transfer (slow path)"); + + if (fmt->emulated || unaligned) { + + // Create all slice buffers first, to early-fail if OOM, and to avoid + // blocking unnecessarily on waiting for these buffers to get read from + num_slices = pl_tex_transfer_slices(gpu, tex_vk->texel_fmt, params, &slices); + for (int i = 0; i < num_slices; i++) { + slices[i].buf = pl_buf_create(gpu, pl_buf_params( + .memory_type = PL_BUF_MEM_DEVICE, + .format = tex_vk->texel_fmt, + .size = pl_tex_transfer_size(&slices[i]), + .storable = fmt->emulated, + )); + + if (!slices[i].buf) { + PL_ERR(gpu, "Failed creating buffer for tex upload fallback!"); + num_slices = i; // only clean up buffers up to here + goto error; + } + } + + // All temporary buffers successfully created, begin copying source data + struct vk_cmd *cmd = CMD_BEGIN_TIMED(tex_vk->transfer_queue, + params->timer); + if (!cmd) + goto error; + + vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_COPY_BIT, + VK_ACCESS_2_TRANSFER_READ_BIT, params->buf_offset, size, + false); + + for (int i = 0; i < num_slices; i++) { + pl_buf slice = slices[i].buf; + struct pl_buf_vk *slice_vk = PL_PRIV(slice); + vk_buf_barrier(gpu, cmd, slice, VK_PIPELINE_STAGE_2_COPY_BIT, + VK_ACCESS_2_TRANSFER_WRITE_BIT, 0, slice->params.size, + false); + + vk->CmdCopyBuffer(cmd->buf, buf_vk->mem.buf, slice_vk->mem.buf, 1, &(VkBufferCopy) { + .srcOffset = buf_vk->mem.offset + slices[i].buf_offset, + .dstOffset = slice_vk->mem.offset, + .size = slice->params.size, + }); + } + + if (params->callback) + vk_cmd_callback(cmd, tex_xfer_cb, params->callback, params->priv); + + bool ok = CMD_FINISH(&cmd); + + // Finally, dispatch the (texel) upload asynchronously. We can fire + // the callback already at the completion of previous command because + // these temporary buffers already hold persistent copies of the data + for (int i = 0; i < num_slices; i++) { + if (ok) { + slices[i].buf_offset = 0; + ok = fmt->emulated ? 
pl_tex_upload_texel(gpu, &slices[i]) + : pl_tex_upload(gpu, &slices[i]); + } + pl_buf_destroy(gpu, &slices[i].buf); + } + + pl_free(slices); + return ok; + + } else { + + pl_assert(fmt->texel_align == fmt->texel_size); + const VkBufferImageCopy region = { + .bufferOffset = buf_offset, + .bufferRowLength = params->row_pitch / fmt->texel_size, + .bufferImageHeight = params->depth_pitch / params->row_pitch, + .imageOffset = { rc.x0, rc.y0, rc.z0 }, + .imageExtent = { rc.x1, rc.y1, rc.z1 }, + .imageSubresource = { + .aspectMask = tex_vk->aspect, + .layerCount = 1, + }, + }; + + enum queue_type queue = vk_img_copy_queue(gpu, tex, ®ion); + struct vk_cmd *cmd = CMD_BEGIN_TIMED(queue, params->timer); + if (!cmd) + goto error; + + vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_COPY_BIT, + VK_ACCESS_2_TRANSFER_READ_BIT, params->buf_offset, size, + false); + vk_tex_barrier(gpu, cmd, tex, VK_PIPELINE_STAGE_2_COPY_BIT, + VK_ACCESS_2_TRANSFER_WRITE_BIT, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + VK_QUEUE_FAMILY_IGNORED); + vk->CmdCopyBufferToImage(cmd->buf, buf_vk->mem.buf, tex_vk->img, + tex_vk->layout, 1, ®ion); + + if (params->callback) + vk_cmd_callback(cmd, tex_xfer_cb, params->callback, params->priv); + + return CMD_FINISH(&cmd); + } + + pl_unreachable(); + +error: + for (int i = 0; i < num_slices; i++) + pl_buf_destroy(gpu, &slices[i].buf); + pl_free(slices); + return false; +} + +bool vk_tex_download(pl_gpu gpu, const struct pl_tex_transfer_params *params) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + pl_tex tex = params->tex; + pl_fmt fmt = tex->params.format; + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + struct pl_tex_transfer_params *slices = NULL; + int num_slices = 0; + + if (!params->buf) + return pl_tex_download_pbo(gpu, params); + + pl_buf buf = params->buf; + struct pl_buf_vk *buf_vk = PL_PRIV(buf); + pl_rect3d rc = params->rc; + const size_t size = pl_tex_transfer_size(params); + const size_t buf_offset = buf_vk->mem.offset + params->buf_offset; + bool unaligned = buf_offset % fmt->texel_size; + if (unaligned) + PL_TRACE(gpu, "vk_tex_download: unaligned transfer (slow path)"); + + if (fmt->emulated || unaligned) { + + num_slices = pl_tex_transfer_slices(gpu, tex_vk->texel_fmt, params, &slices); + for (int i = 0; i < num_slices; i++) { + slices[i].buf = pl_buf_create(gpu, pl_buf_params( + .memory_type = PL_BUF_MEM_DEVICE, + .format = tex_vk->texel_fmt, + .size = pl_tex_transfer_size(&slices[i]), + .storable = fmt->emulated, + )); + + if (!slices[i].buf) { + PL_ERR(gpu, "Failed creating buffer for tex download fallback!"); + num_slices = i; + goto error; + } + } + + for (int i = 0; i < num_slices; i++) { + // Restore buffer offset after downloading into temporary buffer, + // because we still need to copy the data from the temporary buffer + // into this offset in the original buffer + const size_t tmp_offset = slices[i].buf_offset; + slices[i].buf_offset = 0; + bool ok = fmt->emulated ? 
pl_tex_download_texel(gpu, &slices[i]) + : pl_tex_download(gpu, &slices[i]); + slices[i].buf_offset = tmp_offset; + if (!ok) + goto error; + } + + // Finally, download into the user buffer + struct vk_cmd *cmd = CMD_BEGIN_TIMED(tex_vk->transfer_queue, params->timer); + if (!cmd) + goto error; + + vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_COPY_BIT, + VK_ACCESS_2_TRANSFER_WRITE_BIT, params->buf_offset, size, + false); + + for (int i = 0; i < num_slices; i++) { + pl_buf slice = slices[i].buf; + struct pl_buf_vk *slice_vk = PL_PRIV(slice); + vk_buf_barrier(gpu, cmd, slice, VK_PIPELINE_STAGE_2_COPY_BIT, + VK_ACCESS_2_TRANSFER_READ_BIT, 0, slice->params.size, + false); + + vk->CmdCopyBuffer(cmd->buf, slice_vk->mem.buf, buf_vk->mem.buf, 1, &(VkBufferCopy) { + .srcOffset = slice_vk->mem.offset, + .dstOffset = buf_vk->mem.offset + slices[i].buf_offset, + .size = slice->params.size, + }); + + pl_buf_destroy(gpu, &slices[i].buf); + } + + vk_buf_flush(gpu, cmd, buf, params->buf_offset, size); + + if (params->callback) + vk_cmd_callback(cmd, tex_xfer_cb, params->callback, params->priv); + + pl_free(slices); + return CMD_FINISH(&cmd); + + } else { + + pl_assert(params->row_pitch % fmt->texel_size == 0); + pl_assert(params->depth_pitch % params->row_pitch == 0); + const VkBufferImageCopy region = { + .bufferOffset = buf_offset, + .bufferRowLength = params->row_pitch / fmt->texel_size, + .bufferImageHeight = params->depth_pitch / params->row_pitch, + .imageOffset = { rc.x0, rc.y0, rc.z0 }, + .imageExtent = { rc.x1, rc.y1, rc.z1 }, + .imageSubresource = { + .aspectMask = tex_vk->aspect, + .layerCount = 1, + }, + }; + + enum queue_type queue = vk_img_copy_queue(gpu, tex, ®ion); + + struct vk_cmd *cmd = CMD_BEGIN_TIMED(queue, params->timer); + if (!cmd) + goto error; + + vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_COPY_BIT, + VK_ACCESS_2_TRANSFER_WRITE_BIT, params->buf_offset, size, + false); + vk_tex_barrier(gpu, cmd, tex, VK_PIPELINE_STAGE_2_COPY_BIT, + VK_ACCESS_2_TRANSFER_READ_BIT, + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, + VK_QUEUE_FAMILY_IGNORED); + vk->CmdCopyImageToBuffer(cmd->buf, tex_vk->img, tex_vk->layout, + buf_vk->mem.buf, 1, ®ion); + vk_buf_flush(gpu, cmd, buf, params->buf_offset, size); + + if (params->callback) + vk_cmd_callback(cmd, tex_xfer_cb, params->callback, params->priv); + + return CMD_FINISH(&cmd); + } + + pl_unreachable(); + +error: + for (int i = 0; i < num_slices; i++) + pl_buf_destroy(gpu, &slices[i].buf); + pl_free(slices); + return false; +} + +bool vk_tex_poll(pl_gpu gpu, pl_tex tex, uint64_t timeout) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + + // Opportunistically check if we can re-use this texture without flush + vk_poll_commands(vk, 0); + if (pl_rc_count(&tex_vk->rc) == 1) + goto skip_blocking; + + // Otherwise, we're force to submit any queued command so that the user is + // guaranteed to see progress eventually, even if they call this in a loop + CMD_SUBMIT(NULL); + vk_poll_commands(vk, timeout); + if (pl_rc_count(&tex_vk->rc) > 1) + return true; + + // fall through +skip_blocking: + for (int i = 0; i < tex_vk->num_planes; i++) { + if (vk_tex_poll(gpu, tex->planes[i], timeout)) + return true; + } + + return false; +} + +bool vk_tex_export(pl_gpu gpu, pl_tex tex, pl_sync sync) +{ + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + struct pl_sync_vk *sync_vk = PL_PRIV(sync); + + if (tex_vk->num_planes) { + PL_ERR(gpu, "`pl_tex_export` cannot be called on planar textures." 
+ "Please see `pl_vulkan_hold_ex` for a replacement."); + return false; + } + + struct vk_cmd *cmd = CMD_BEGIN(ANY); + if (!cmd) + goto error; + + vk_tex_barrier(gpu, cmd, tex, VK_PIPELINE_STAGE_2_NONE, + 0, VK_IMAGE_LAYOUT_GENERAL, VK_QUEUE_FAMILY_EXTERNAL); + + // Make the next barrier appear as though coming from a different queue + tex_vk->sem.write.queue = tex_vk->sem.read.queue = NULL; + + vk_cmd_sig(cmd, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, (pl_vulkan_sem){ sync_vk->wait }); + if (!CMD_SUBMIT(&cmd)) + goto error; + + // Remember the other dependency and hold on to the sync object + PL_ARRAY_APPEND(tex, tex_vk->ext_deps, (pl_vulkan_sem){ sync_vk->signal }); + pl_rc_ref(&sync_vk->rc); + tex_vk->ext_sync = sync; + tex_vk->qf = VK_QUEUE_FAMILY_EXTERNAL; + return true; + +error: + PL_ERR(gpu, "Failed exporting shared texture!"); + return false; +} + +pl_tex pl_vulkan_wrap(pl_gpu gpu, const struct pl_vulkan_wrap_params *params) +{ + pl_fmt fmt = NULL; + for (int i = 0; i < gpu->num_formats; i++) { + const struct vk_format **vkfmt = PL_PRIV(gpu->formats[i]); + if ((*vkfmt)->tfmt == params->format) { + fmt = gpu->formats[i]; + break; + } + } + + if (!fmt) { + PL_ERR(gpu, "Could not find pl_fmt suitable for wrapped image " + "with format %s", vk_fmt_name(params->format)); + return NULL; + } + + VkImageUsageFlags usage = params->usage; + if (fmt->num_planes) + usage = 0; // mask capabilities from the base texture + + struct pl_tex_t *tex = pl_zalloc_obj(NULL, tex, struct pl_tex_vk); + tex->params = (struct pl_tex_params) { + .format = fmt, + .w = params->width, + .h = params->height, + .d = params->depth, + .sampleable = !!(usage & VK_IMAGE_USAGE_SAMPLED_BIT), + .renderable = !!(usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT), + .storable = !!(usage & VK_IMAGE_USAGE_STORAGE_BIT), + .blit_src = !!(usage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT), + .blit_dst = !!(usage & VK_IMAGE_USAGE_TRANSFER_DST_BIT), + .host_writable = !!(usage & VK_IMAGE_USAGE_TRANSFER_DST_BIT), + .host_readable = !!(usage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT), + .user_data = params->user_data, + .debug_tag = params->debug_tag, + }; + + // Mask out capabilities not permitted by the `pl_fmt` +#define MASK(field, cap) \ + do { \ + if (tex->params.field && !(fmt->caps & cap)) { \ + PL_WARN(gpu, "Masking `" #field "` from wrapped texture because " \ + "the corresponding format '%s' does not support " #cap, \ + fmt->name); \ + tex->params.field = false; \ + } \ + } while (0) + + MASK(sampleable, PL_FMT_CAP_SAMPLEABLE); + MASK(renderable, PL_FMT_CAP_RENDERABLE); + MASK(storable, PL_FMT_CAP_STORABLE); + MASK(blit_src, PL_FMT_CAP_BLITTABLE); + MASK(blit_dst, PL_FMT_CAP_BLITTABLE); + MASK(host_readable, PL_FMT_CAP_HOST_READABLE); +#undef MASK + + // For simplicity, explicitly mask out blit emulation for wrapped textures + struct pl_fmt_vk *fmtp = PL_PRIV(fmt); + if (fmtp->blit_emulated) { + tex->params.blit_src = false; + tex->params.blit_dst = false; + } + + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + switch (pl_tex_params_dimension(tex->params)) { + case 1: tex_vk->type = VK_IMAGE_TYPE_1D; break; + case 2: tex_vk->type = VK_IMAGE_TYPE_2D; break; + case 3: tex_vk->type = VK_IMAGE_TYPE_3D; break; + } + tex_vk->external_img = true; + tex_vk->held = !fmt->num_planes; + tex_vk->img = params->image; + tex_vk->img_fmt = params->format; + tex_vk->num_planes = fmt->num_planes; + tex_vk->usage_flags = usage; + tex_vk->aspect = params->aspect; + + if (!tex_vk->aspect) { + for (int i = 0; i < tex_vk->num_planes; i++) + tex_vk->aspect |= 
VK_IMAGE_ASPECT_PLANE_0_BIT << i; + tex_vk->aspect = PL_DEF(tex_vk->aspect, VK_IMAGE_ASPECT_COLOR_BIT); + } + + // Blitting to planar images requires fallback via compute shaders + if (tex_vk->aspect != VK_IMAGE_ASPECT_COLOR_BIT) { + tex->params.blit_src &= tex->params.storable; + tex->params.blit_dst &= tex->params.storable; + } + + static const char * const wrapped_plane_names[4] = { + "wrapped plane 0", "wrapped plane 1", "wrapped plane 2", "wrapped plane 3", + }; + + for (int i = 0; i < tex_vk->num_planes; i++) { + struct pl_tex_t *plane; + VkImageAspectFlags aspect = VK_IMAGE_ASPECT_PLANE_0_BIT << i; + if (!(aspect & tex_vk->aspect)) { + PL_INFO(gpu, "Not wrapping plane %d due to aspect bit 0x%x not " + "being contained in supplied params->aspect 0x%x!", + i, (unsigned) aspect, (unsigned) tex_vk->aspect); + continue; + } + + pl_assert(tex_vk->type == VK_IMAGE_TYPE_2D); + plane = (struct pl_tex_t *) pl_vulkan_wrap(gpu, pl_vulkan_wrap_params( + .image = tex_vk->img, + .aspect = aspect, + .width = PL_RSHIFT_UP(tex->params.w, fmt->planes[i].shift_x), + .height = PL_RSHIFT_UP(tex->params.h, fmt->planes[i].shift_y), + .format = fmtp->vk_fmt->pfmt[i].fmt, + .usage = params->usage, + .user_data = params->user_data, + .debug_tag = PL_DEF(params->debug_tag, wrapped_plane_names[i]), + )); + if (!plane) + goto error; + plane->parent = tex; + tex->planes[i] = plane; + tex_vk->planes[i] = PL_PRIV(plane); + } + + if (!vk_init_image(gpu, tex, PL_DEF(params->debug_tag, "wrapped"))) + goto error; + + return tex; + +error: + vk_tex_destroy(gpu, tex); + return NULL; +} + +VkImage pl_vulkan_unwrap(pl_gpu gpu, pl_tex tex, VkFormat *out_format, + VkImageUsageFlags *out_flags) +{ + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + + if (out_format) + *out_format = tex_vk->img_fmt; + if (out_flags) + *out_flags = tex_vk->usage_flags; + + return tex_vk->img; +} + +bool pl_vulkan_hold_ex(pl_gpu gpu, const struct pl_vulkan_hold_params *params) +{ + struct pl_tex_vk *tex_vk = PL_PRIV(params->tex); + pl_assert(params->semaphore.sem); + + bool held = tex_vk->held; + for (int i = 0; i < tex_vk->num_planes; i++) + held |= tex_vk->planes[i]->held; + + if (held) { + PL_ERR(gpu, "Attempting to hold an already held image!"); + return false; + } + + struct vk_cmd *cmd = CMD_BEGIN(GRAPHICS); + if (!cmd) { + PL_ERR(gpu, "Failed holding external image!"); + return false; + } + + VkImageLayout layout = params->layout; + if (params->out_layout) { + // For planar images, arbitrarily pick the current image layout of the + // first plane. This should be fine in practice, since all planes will + // share the same usage capabilities. 
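+ // A minimal caller-side sketch (`nv12_tex` and `sem` are hypothetical):
+ // holding a wrapped planar image with `.out_layout` set reports the first
+ // plane's current layout for the whole image, or VK_IMAGE_LAYOUT_UNDEFINED
+ // if the contents may be invalidated:
+ //
+ //     VkImageLayout layout;
+ //     pl_vulkan_hold_ex(gpu, pl_vulkan_hold_params(
+ //         .tex        = nv12_tex,
+ //         .out_layout = &layout,
+ //         .semaphore  = sem,
+ //         .qf         = VK_QUEUE_FAMILY_IGNORED,
+ //     ));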
+ if (tex_vk->num_planes) { + layout = tex_vk->planes[0]->layout; + } else { + layout = tex_vk->layout; + } + } + + bool may_invalidate = true; + if (!tex_vk->num_planes) { + may_invalidate &= tex_vk->may_invalidate; + vk_tex_barrier(gpu, cmd, params->tex, VK_PIPELINE_STAGE_2_NONE, + 0, layout, params->qf); + } + + for (int i = 0; i < tex_vk->num_planes; i++) { + may_invalidate &= tex_vk->planes[i]->may_invalidate; + vk_tex_barrier(gpu, cmd, params->tex->planes[i], + VK_PIPELINE_STAGE_2_NONE, 0, layout, params->qf); + } + + vk_cmd_sig(cmd, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, params->semaphore); + bool ok = CMD_SUBMIT(&cmd); + + if (!tex_vk->num_planes) { + tex_vk->sem.write.queue = tex_vk->sem.read.queue = NULL; + tex_vk->held = ok; + } + + for (int i = 0; i < tex_vk->num_planes; i++) { + struct pl_tex_vk *plane_vk = tex_vk->planes[i]; + plane_vk->sem.write.queue = plane_vk->sem.read.queue = NULL; + plane_vk->held = ok; + } + + if (ok && params->out_layout) + *params->out_layout = may_invalidate ? VK_IMAGE_LAYOUT_UNDEFINED : layout; + + return ok; +} + +void pl_vulkan_release_ex(pl_gpu gpu, const struct pl_vulkan_release_params *params) +{ + struct pl_tex_vk *tex_vk = PL_PRIV(params->tex); + if (tex_vk->num_planes) { + struct pl_vulkan_release_params plane_pars = *params; + for (int i = 0; i < tex_vk->num_planes; i++) { + plane_pars.tex = params->tex->planes[i]; + pl_vulkan_release_ex(gpu, &plane_pars); + } + return; + } + + if (!tex_vk->held) { + PL_ERR(gpu, "Attempting to release an unheld image?"); + return; + } + + if (params->semaphore.sem) + PL_ARRAY_APPEND(params->tex, tex_vk->ext_deps, params->semaphore); + + tex_vk->qf = params->qf; + tex_vk->layout = params->layout; + tex_vk->held = false; +} + +bool pl_vulkan_hold(pl_gpu gpu, pl_tex tex, VkImageLayout layout, + pl_vulkan_sem sem_out) +{ + return pl_vulkan_hold_ex(gpu, pl_vulkan_hold_params( + .tex = tex, + .layout = layout, + .semaphore = sem_out, + .qf = VK_QUEUE_FAMILY_IGNORED, + )); +} + +bool pl_vulkan_hold_raw(pl_gpu gpu, pl_tex tex, + VkImageLayout *out_layout, + pl_vulkan_sem sem_out) +{ + return pl_vulkan_hold_ex(gpu, pl_vulkan_hold_params( + .tex = tex, + .out_layout = out_layout, + .semaphore = sem_out, + .qf = VK_QUEUE_FAMILY_IGNORED, + )); +} + +void pl_vulkan_release(pl_gpu gpu, pl_tex tex, VkImageLayout layout, + pl_vulkan_sem sem_in) +{ + pl_vulkan_release_ex(gpu, pl_vulkan_release_params( + .tex = tex, + .layout = layout, + .semaphore = sem_in, + .qf = VK_QUEUE_FAMILY_IGNORED, + )); +} diff --git a/src/vulkan/malloc.c b/src/vulkan/malloc.c new file mode 100644 index 0000000..c35183b --- /dev/null +++ b/src/vulkan/malloc.c @@ -0,0 +1,1058 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "malloc.h" +#include "command.h" +#include "utils.h" +#include "pl_thread.h" + +#ifdef PL_HAVE_UNIX +#include <errno.h> +#include <unistd.h> +#endif + +// Controls the page size alignment, to help coalesce allocations into the same +// slab. Pages are rounded up to multiples of this value. (Default: 4 KB) +#define PAGE_SIZE_ALIGN (1LLU << 12) + +// Controls the minimum/maximum number of pages for new slabs. As slabs are +// exhausted of memory, the number of pages per new slab grows exponentially, +// starting with the minimum until the maximum is reached. +// +// Note: The maximum must never exceed the size of `vk_slab.spacemap`. +#define MINIMUM_PAGE_COUNT 4 +#define MAXIMUM_PAGE_COUNT (sizeof(uint64_t) * 8) + +// Controls the maximum page size. Any allocations above this threshold +// (absolute size or fraction of VRAM, whichever is higher) will be served by +// dedicated allocations. (Default: 64 MB or 1/16 of VRAM) +#define MAXIMUM_PAGE_SIZE_ABSOLUTE (1LLU << 26) +#define MAXIMUM_PAGE_SIZE_RELATIVE 16 + +// Controls the minimum slab size, to avoid excessive re-allocation of very +// small slabs. (Default: 256 KB) +#define MINIMUM_SLAB_SIZE (1LLU << 18) + +// How long to wait before garbage collecting empty slabs. Slabs older than +// this many invocations of `vk_malloc_garbage_collect` will be released. +#define MAXIMUM_SLAB_AGE 32 + +// A single slab represents a contiguous region of allocated memory. Actual +// allocations are served as pages of this. Slabs are organized into pools, +// each of which contains a list of slabs of differing page sizes. +struct vk_slab { + pl_mutex lock; + pl_debug_tag debug_tag; // debug tag of the triggering allocation + VkDeviceMemory mem; // underlying device allocation + VkDeviceSize size; // total allocated size of `mem` + VkMemoryType mtype; // underlying memory type + bool dedicated; // slab is allocated specifically for one object + bool imported; // slab represents an imported memory allocation + + // free space accounting (only for non-dedicated slabs) + uint64_t spacemap; // bitset of available pages + size_t pagesize; // size in bytes per page + size_t used; // number of bytes actually in use + uint64_t age; // timestamp of last use + + // optional, depends on the memory type: + VkBuffer buffer; // buffer spanning the entire slab + void *data; // mapped memory corresponding to `mem` + bool coherent; // mapped memory is coherent + union pl_handle handle; // handle associated with this device memory + enum pl_handle_type handle_type; +}; + +// Represents a single memory pool. We keep track of a vk_pool for each +// combination of malloc parameters. This shouldn't actually be that many in +// practice, because some combinations simply never occur, and others will +// generally be the same for the same objects. +// +// Note: `vk_pool` addresses are not immutable, so we mustn't expose any +// dangling references to a `vk_pool` from e.g. `vk_memslice.priv = vk_slab`. +struct vk_pool { + struct vk_malloc_params params; // allocation params (with some fields nulled) + PL_ARRAY(struct vk_slab *) slabs; // array of slabs, unsorted + int index; // running index in `vk_malloc.pools` +}; + +// The overall state of the allocator, which keeps track of a vk_pool for each +// memory type. 
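+//
+// For instance (hypothetical combinations): host-visible upload buffers
+// (required = HOST_VISIBLE, buf_usage = TRANSFER_SRC) and device-local
+// image memory (optimal = DEVICE_LOCAL, no buf_usage) end up in two
+// separate pools, each growing its own list of slabs.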
+struct vk_malloc { + struct vk_ctx *vk; + pl_mutex lock; + VkPhysicalDeviceMemoryProperties props; + size_t maximum_page_size; + PL_ARRAY(struct vk_pool) pools; + uint64_t age; +}; + +static inline float efficiency(size_t used, size_t total) +{ + if (!total) + return 100.0; + + return 100.0f * used / total; +} + +static const char *print_size(char buf[8], size_t size) +{ + const char *suffixes = "\0KMG"; + while (suffixes[1] && size > 9999) { + size >>= 10; + suffixes++; + } + + int ret = *suffixes ? snprintf(buf, 8, "%4zu%c", size, *suffixes) + : snprintf(buf, 8, "%5zu", size); + + return ret >= 0 ? buf : "(error)"; +} + +#define PRINT_SIZE(x) (print_size((char[8]){0}, (size_t) (x))) + +void vk_malloc_print_stats(struct vk_malloc *ma, enum pl_log_level lev) +{ + struct vk_ctx *vk = ma->vk; + size_t total_size = 0; + size_t total_used = 0; + size_t total_res = 0; + + PL_MSG(vk, lev, "Memory heaps supported by device:"); + for (int i = 0; i < ma->props.memoryHeapCount; i++) { + VkMemoryHeap heap = ma->props.memoryHeaps[i]; + PL_MSG(vk, lev, " %d: flags 0x%x size %s", + i, (unsigned) heap.flags, PRINT_SIZE(heap.size)); + } + + PL_DEBUG(vk, "Memory types supported by device:"); + for (int i = 0; i < ma->props.memoryTypeCount; i++) { + VkMemoryType type = ma->props.memoryTypes[i]; + PL_DEBUG(vk, " %d: flags 0x%x heap %d", + i, (unsigned) type.propertyFlags, (int) type.heapIndex); + } + + pl_mutex_lock(&ma->lock); + for (int i = 0; i < ma->pools.num; i++) { + struct vk_pool *pool = &ma->pools.elem[i]; + const struct vk_malloc_params *par = &pool->params; + + PL_MSG(vk, lev, "Memory pool %d:", i); + PL_MSG(vk, lev, " Compatible types: 0x%"PRIx32, par->reqs.memoryTypeBits); + if (par->required) + PL_MSG(vk, lev, " Required flags: 0x%"PRIx32, par->required); + if (par->optimal) + PL_MSG(vk, lev, " Optimal flags: 0x%"PRIx32, par->optimal); + if (par->buf_usage) + PL_MSG(vk, lev, " Buffer flags: 0x%"PRIx32, par->buf_usage); + if (par->export_handle) + PL_MSG(vk, lev, " Export handle: 0x%x", par->export_handle); + + size_t pool_size = 0; + size_t pool_used = 0; + size_t pool_res = 0; + + for (int j = 0; j < pool->slabs.num; j++) { + struct vk_slab *slab = pool->slabs.elem[j]; + pl_mutex_lock(&slab->lock); + + size_t avail = __builtin_popcountll(slab->spacemap) * slab->pagesize; + size_t slab_res = slab->size - avail; + + PL_MSG(vk, lev, " Slab %2d: %8"PRIx64" x %s: " + "%s used %s res %s alloc from heap %d, efficiency %.2f%% [%s]", + j, slab->spacemap, PRINT_SIZE(slab->pagesize), + PRINT_SIZE(slab->used), PRINT_SIZE(slab_res), + PRINT_SIZE(slab->size), (int) slab->mtype.heapIndex, + efficiency(slab->used, slab_res), + PL_DEF(slab->debug_tag, "unknown")); + + pool_size += slab->size; + pool_used += slab->used; + pool_res += slab_res; + pl_mutex_unlock(&slab->lock); + } + + PL_MSG(vk, lev, " Pool summary: %s used %s res %s alloc, " + "efficiency %.2f%%, utilization %.2f%%", + PRINT_SIZE(pool_used), PRINT_SIZE(pool_res), + PRINT_SIZE(pool_size), efficiency(pool_used, pool_res), + efficiency(pool_res, pool_size)); + + total_size += pool_size; + total_used += pool_used; + total_res += pool_res; + } + pl_mutex_unlock(&ma->lock); + + PL_MSG(vk, lev, "Memory summary: %s used %s res %s alloc, " + "efficiency %.2f%%, utilization %.2f%%, max page: %s", + PRINT_SIZE(total_used), PRINT_SIZE(total_res), + PRINT_SIZE(total_size), efficiency(total_used, total_res), + efficiency(total_res, total_size), + PRINT_SIZE(ma->maximum_page_size)); +} + +static void slab_free(struct vk_ctx *vk, struct vk_slab *slab) +{ + if 
(!slab) + return; + +#ifndef NDEBUG + if (!slab->dedicated && slab->used > 0) { + PL_WARN(vk, "Leaked %zu bytes of vulkan memory!", slab->used); + PL_WARN(vk, "slab total size: %zu bytes, heap: %d, flags: 0x%"PRIX64, + (size_t) slab->size, (int) slab->mtype.heapIndex, + (uint64_t) slab->mtype.propertyFlags); + if (slab->debug_tag) + PL_WARN(vk, "last used for: %s", slab->debug_tag); + pl_log_stack_trace(vk->log, PL_LOG_WARN); + pl_debug_abort(); + } +#endif + + if (slab->imported) { + switch (slab->handle_type) { + case PL_HANDLE_FD: + case PL_HANDLE_DMA_BUF: + PL_TRACE(vk, "Unimporting slab of size %s from fd: %d", + PRINT_SIZE(slab->size), slab->handle.fd); + break; + case PL_HANDLE_WIN32: + case PL_HANDLE_WIN32_KMT: +#ifdef PL_HAVE_WIN32 + PL_TRACE(vk, "Unimporting slab of size %s from handle: %p", + PRINT_SIZE(slab->size), (void *) slab->handle.handle); +#endif + break; + case PL_HANDLE_HOST_PTR: + PL_TRACE(vk, "Unimporting slab of size %s from ptr: %p", + PRINT_SIZE(slab->size), (void *) slab->handle.ptr); + break; + case PL_HANDLE_IOSURFACE: + case PL_HANDLE_MTL_TEX: + pl_unreachable(); + } + } else { + switch (slab->handle_type) { + case PL_HANDLE_FD: + case PL_HANDLE_DMA_BUF: +#ifdef PL_HAVE_UNIX + if (slab->handle.fd > -1) + close(slab->handle.fd); +#endif + break; + case PL_HANDLE_WIN32: +#ifdef PL_HAVE_WIN32 + if (slab->handle.handle != NULL) + CloseHandle(slab->handle.handle); +#endif + break; + case PL_HANDLE_WIN32_KMT: + // PL_HANDLE_WIN32_KMT is just an identifier. It doesn't get closed. + break; + case PL_HANDLE_HOST_PTR: + // Implicitly unmapped + break; + case PL_HANDLE_IOSURFACE: + case PL_HANDLE_MTL_TEX: + pl_unreachable(); + } + + PL_DEBUG(vk, "Freeing slab of size %s", PRINT_SIZE(slab->size)); + } + + vk->DestroyBuffer(vk->dev, slab->buffer, PL_VK_ALLOC); + // also implicitly unmaps the memory if needed + vk->FreeMemory(vk->dev, slab->mem, PL_VK_ALLOC); + + pl_mutex_destroy(&slab->lock); + pl_free(slab); +} + +// type_mask: optional +// thread-safety: safe +static bool find_best_memtype(const struct vk_malloc *ma, uint32_t type_mask, + const struct vk_malloc_params *params, + uint32_t *out_index) +{ + struct vk_ctx *vk = ma->vk; + int best = -1; + + // The vulkan spec requires memory types to be sorted in the "optimal" + // order, so the first matching type we find will be the best/fastest one. + // That being said, we still want to prioritize memory types that have + // better optional flags. 
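+ //
+ // A worked example (hypothetical memory types): with
+ // required = HOST_VISIBLE_BIT and optimal = DEVICE_LOCAL_BIT | HOST_CACHED_BIT,
+ // a type offering HOST_VISIBLE | HOST_COHERENT scores 0, while a later type
+ // offering HOST_VISIBLE | HOST_CACHED | DEVICE_LOCAL scores 2 and wins.
+ // Ties keep the earlier (spec-ordered, i.e. faster) type, since the score
+ // comparison below is strictly greater-than.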
+ + type_mask &= params->reqs.memoryTypeBits; + for (int i = 0; i < ma->props.memoryTypeCount; i++) { + const VkMemoryType *mtype = &ma->props.memoryTypes[i]; + + // The memory type flags must include our properties + if ((mtype->propertyFlags & params->required) != params->required) + continue; + + // The memory heap must be large enough for the allocation + VkDeviceSize heapSize = ma->props.memoryHeaps[mtype->heapIndex].size; + if (params->reqs.size > heapSize) + continue; + + // The memory type must be supported by the type mask (bitfield) + if (!(type_mask & (1LU << i))) + continue; + + // Calculate the score as the number of optimal property flags matched + int score = __builtin_popcountl(mtype->propertyFlags & params->optimal); + if (score > best) { + *out_index = i; + best = score; + } + } + + if (best < 0) { + PL_ERR(vk, "Found no memory type matching property flags 0x%x and type " + "bits 0x%x!", + (unsigned) params->required, (unsigned) type_mask); + return false; + } + + return true; +} + +static bool buf_external_check(struct vk_ctx *vk, VkBufferUsageFlags usage, + enum pl_handle_type handle_type, bool import) +{ + if (!handle_type) + return true; + + VkPhysicalDeviceExternalBufferInfo info = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_BUFFER_INFO_KHR, + .usage = usage, + .handleType = vk_mem_handle_type(handle_type), + }; + + VkExternalBufferProperties props = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_BUFFER_PROPERTIES_KHR, + }; + + if (!info.handleType) + return false; + + vk->GetPhysicalDeviceExternalBufferProperties(vk->physd, &info, &props); + return vk_external_mem_check(vk, &props.externalMemoryProperties, + handle_type, import); +} + +// thread-safety: safe +static struct vk_slab *slab_alloc(struct vk_malloc *ma, + const struct vk_malloc_params *params) +{ + struct vk_ctx *vk = ma->vk; + struct vk_slab *slab = pl_alloc_ptr(NULL, slab); + *slab = (struct vk_slab) { + .age = ma->age, + .size = params->reqs.size, + .handle_type = params->export_handle, + .debug_tag = params->debug_tag, + }; + pl_mutex_init(&slab->lock); + + switch (slab->handle_type) { + case PL_HANDLE_FD: + case PL_HANDLE_DMA_BUF: + slab->handle.fd = -1; + break; + case PL_HANDLE_WIN32: + case PL_HANDLE_WIN32_KMT: + case PL_HANDLE_MTL_TEX: + case PL_HANDLE_IOSURFACE: + slab->handle.handle = NULL; + break; + case PL_HANDLE_HOST_PTR: + slab->handle.ptr = NULL; + break; + } + + VkExportMemoryAllocateInfoKHR ext_info = { + .sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_KHR, + .handleTypes = vk_mem_handle_type(slab->handle_type), + }; + + uint32_t type_mask = UINT32_MAX; + if (params->buf_usage) { + // Queue family sharing modes don't matter for buffers, so we just + // set them as concurrent and stop worrying about it. + uint32_t qfs[3] = {0}; + pl_assert(vk->pools.num <= PL_ARRAY_SIZE(qfs)); + for (int i = 0; i < vk->pools.num; i++) + qfs[i] = vk->pools.elem[i]->qf; + + VkExternalMemoryBufferCreateInfoKHR ext_buf_info = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO_KHR, + .handleTypes = ext_info.handleTypes, + }; + + VkBufferCreateInfo binfo = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = slab->handle_type ? &ext_buf_info : NULL, + .size = slab->size, + .usage = params->buf_usage, + .sharingMode = vk->pools.num > 1 ? 
VK_SHARING_MODE_CONCURRENT + : VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = vk->pools.num, + .pQueueFamilyIndices = qfs, + }; + + if (!buf_external_check(vk, binfo.usage, slab->handle_type, false)) { + PL_ERR(vk, "Failed allocating shared memory buffer: possibly " + "the handle type is unsupported?"); + goto error; + } + + VK(vk->CreateBuffer(vk->dev, &binfo, PL_VK_ALLOC, &slab->buffer)); + PL_VK_NAME(BUFFER, slab->buffer, "slab"); + + VkMemoryRequirements reqs = {0}; + vk->GetBufferMemoryRequirements(vk->dev, slab->buffer, &reqs); + slab->size = reqs.size; // this can be larger than `slab->size` + type_mask = reqs.memoryTypeBits; + + // Note: we can ignore `reqs.align` because we always bind the buffer + // memory to offset 0 + } + + VkMemoryAllocateInfo minfo = { + .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, + .allocationSize = slab->size, + }; + + if (params->export_handle) + vk_link_struct(&minfo, &ext_info); + + VkMemoryDedicatedAllocateInfoKHR dinfo = { + .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR, + .image = params->ded_image, + }; + + if (params->ded_image) + vk_link_struct(&minfo, &dinfo); + + if (!find_best_memtype(ma, type_mask, params, &minfo.memoryTypeIndex)) + goto error; + + const VkMemoryType *mtype = &ma->props.memoryTypes[minfo.memoryTypeIndex]; + PL_DEBUG(vk, "Allocating %zu memory of type 0x%x (id %d) in heap %d: %s", + (size_t) slab->size, (unsigned) mtype->propertyFlags, + (int) minfo.memoryTypeIndex, (int) mtype->heapIndex, + PL_DEF(params->debug_tag, "unknown")); + + pl_clock_t start = pl_clock_now(); + + VkResult res = vk->AllocateMemory(vk->dev, &minfo, PL_VK_ALLOC, &slab->mem); + switch (res) { + case VK_ERROR_OUT_OF_DEVICE_MEMORY: + case VK_ERROR_OUT_OF_HOST_MEMORY: + PL_ERR(vk, "Allocation of size %s failed: %s!", + PRINT_SIZE(slab->size), vk_res_str(res)); + vk_malloc_print_stats(ma, PL_LOG_ERR); + pl_log_stack_trace(vk->log, PL_LOG_ERR); + pl_debug_abort(); + goto error; + + default: + PL_VK_ASSERT(res, "vkAllocateMemory"); + } + + slab->mtype = *mtype; + if (mtype->propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) { + VK(vk->MapMemory(vk->dev, slab->mem, 0, VK_WHOLE_SIZE, 0, &slab->data)); + slab->coherent = mtype->propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; + } + + if (slab->buffer) + VK(vk->BindBufferMemory(vk->dev, slab->buffer, slab->mem, 0)); + +#ifdef PL_HAVE_UNIX + if (slab->handle_type == PL_HANDLE_FD || + slab->handle_type == PL_HANDLE_DMA_BUF) + { + VkMemoryGetFdInfoKHR fd_info = { + .sType = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR, + .memory = slab->mem, + .handleType = ext_info.handleTypes, + }; + + VK(vk->GetMemoryFdKHR(vk->dev, &fd_info, &slab->handle.fd)); + } +#endif + +#ifdef PL_HAVE_WIN32 + if (slab->handle_type == PL_HANDLE_WIN32 || + slab->handle_type == PL_HANDLE_WIN32_KMT) + { + VkMemoryGetWin32HandleInfoKHR handle_info = { + .sType = VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR, + .memory = slab->mem, + .handleType = ext_info.handleTypes, + }; + + VK(vk->GetMemoryWin32HandleKHR(vk->dev, &handle_info, + &slab->handle.handle)); + } +#endif + + pl_log_cpu_time(vk->log, start, pl_clock_now(), "allocating slab"); + + // free space accounting is done by the caller + return slab; + +error: + if (params->debug_tag) + PL_ERR(vk, " for malloc: %s", params->debug_tag); + slab_free(vk, slab); + return NULL; +} + +static void pool_uninit(struct vk_ctx *vk, struct vk_pool *pool) +{ + for (int i = 0; i < pool->slabs.num; i++) + slab_free(vk, pool->slabs.elem[i]); + + pl_free(pool->slabs.elem); + 
*pool = (struct vk_pool) {0}; +} + +struct vk_malloc *vk_malloc_create(struct vk_ctx *vk) +{ + struct vk_malloc *ma = pl_zalloc_ptr(NULL, ma); + pl_mutex_init(&ma->lock); + vk->GetPhysicalDeviceMemoryProperties(vk->physd, &ma->props); + ma->vk = vk; + + // Determine maximum page size + ma->maximum_page_size = MAXIMUM_PAGE_SIZE_ABSOLUTE; + for (int i = 0; i < ma->props.memoryHeapCount; i++) { + VkMemoryHeap heap = ma->props.memoryHeaps[i]; + if (heap.flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) { + size_t size_max = heap.size / MAXIMUM_PAGE_SIZE_RELATIVE; + ma->maximum_page_size = PL_MAX(ma->maximum_page_size, size_max); + } + } + + vk_malloc_print_stats(ma, PL_LOG_INFO); + return ma; +} + +void vk_malloc_destroy(struct vk_malloc **ma_ptr) +{ + struct vk_malloc *ma = *ma_ptr; + if (!ma) + return; + + vk_malloc_print_stats(ma, PL_LOG_DEBUG); + for (int i = 0; i < ma->pools.num; i++) + pool_uninit(ma->vk, &ma->pools.elem[i]); + + pl_mutex_destroy(&ma->lock); + pl_free_ptr(ma_ptr); +} + +void vk_malloc_garbage_collect(struct vk_malloc *ma) +{ + struct vk_ctx *vk = ma->vk; + + pl_mutex_lock(&ma->lock); + ma->age++; + + for (int i = 0; i < ma->pools.num; i++) { + struct vk_pool *pool = &ma->pools.elem[i]; + for (int n = 0; n < pool->slabs.num; n++) { + struct vk_slab *slab = pool->slabs.elem[n]; + pl_mutex_lock(&slab->lock); + if (slab->used || (ma->age - slab->age) <= MAXIMUM_SLAB_AGE) { + pl_mutex_unlock(&slab->lock); + continue; + } + + PL_DEBUG(vk, "Garbage collected slab of size %s from pool %d", + PRINT_SIZE(slab->size), pool->index); + + pl_mutex_unlock(&slab->lock); + slab_free(ma->vk, slab); + PL_ARRAY_REMOVE_AT(pool->slabs, n--); + } + } + + pl_mutex_unlock(&ma->lock); +} + +pl_handle_caps vk_malloc_handle_caps(const struct vk_malloc *ma, bool import) +{ + struct vk_ctx *vk = ma->vk; + pl_handle_caps caps = 0; + + for (int i = 0; vk_mem_handle_list[i]; i++) { + // Try seeing if we could allocate a "basic" buffer using these + // capabilities, with no fancy buffer usage. More specific checks will + // happen down the line at VkBuffer creation time, but this should give + // us a rough idea of what the driver supports. 
+ enum pl_handle_type type = vk_mem_handle_list[i]; + if (buf_external_check(vk, VK_BUFFER_USAGE_TRANSFER_DST_BIT, type, import)) + caps |= type; + } + + return caps; +} + +void vk_malloc_free(struct vk_malloc *ma, struct vk_memslice *slice) +{ + struct vk_ctx *vk = ma->vk; + struct vk_slab *slab = slice->priv; + if (!slab || slab->dedicated) { + slab_free(vk, slab); + goto done; + } + + pl_mutex_lock(&slab->lock); + + int page_idx = slice->offset / slab->pagesize; + slab->spacemap |= 0x1LLU << page_idx; + slab->used -= slice->size; + slab->age = ma->age; + pl_assert(slab->used >= 0); + + pl_mutex_unlock(&slab->lock); + +done: + *slice = (struct vk_memslice) {0}; +} + +static inline bool pool_params_eq(const struct vk_malloc_params *a, + const struct vk_malloc_params *b) +{ + return a->reqs.size == b->reqs.size && + a->reqs.alignment == b->reqs.alignment && + a->reqs.memoryTypeBits == b->reqs.memoryTypeBits && + a->required == b->required && + a->optimal == b->optimal && + a->buf_usage == b->buf_usage && + a->export_handle == b->export_handle; +} + +static struct vk_pool *find_pool(struct vk_malloc *ma, + const struct vk_malloc_params *params) +{ + pl_assert(!params->import_handle); + pl_assert(!params->ded_image); + + struct vk_malloc_params fixed = *params; + fixed.reqs.alignment = 0; + fixed.reqs.size = 0; + fixed.shared_mem = (struct pl_shared_mem) {0}; + + for (int i = 0; i < ma->pools.num; i++) { + if (pool_params_eq(&ma->pools.elem[i].params, &fixed)) + return &ma->pools.elem[i]; + } + + // Not found => add it + PL_ARRAY_GROW(ma, ma->pools); + size_t idx = ma->pools.num++; + ma->pools.elem[idx] = (struct vk_pool) { + .params = fixed, + .index = idx, + }; + return &ma->pools.elem[idx]; +} + +// Returns a suitable memory page from the pool. A new slab will be allocated +// under the hood, if necessary. +// +// Note: This locks the slab it returns +static struct vk_slab *pool_get_page(struct vk_malloc *ma, struct vk_pool *pool, + size_t size, size_t align, + VkDeviceSize *offset) +{ + struct vk_slab *slab = NULL; + int slab_pages = MINIMUM_PAGE_COUNT; + size = PL_ALIGN2(size, PAGE_SIZE_ALIGN); + const size_t pagesize = PL_ALIGN(size, align); + + for (int i = 0; i < pool->slabs.num; i++) { + slab = pool->slabs.elem[i]; + if (slab->pagesize < size) + continue; + if (slab->pagesize > pagesize * MINIMUM_PAGE_COUNT) // rough heuristic + continue; + if (slab->pagesize % align) + continue; + + pl_mutex_lock(&slab->lock); + int page_idx = __builtin_ffsll(slab->spacemap); + if (!page_idx--) { + pl_mutex_unlock(&slab->lock); + // Increase the number of slabs to allocate for new slabs the + // more existing full slabs exist for this size range + slab_pages = PL_MIN(slab_pages << 1, MAXIMUM_PAGE_COUNT); + continue; + } + + slab->spacemap ^= 0x1LLU << page_idx; + *offset = page_idx * slab->pagesize; + return slab; + } + + // Otherwise, allocate a new vk_slab and append it to the list. 
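+ //
+ // Worked example with the defaults above (hypothetical request): a 20 KB
+ // allocation with 256-byte alignment rounds up to a 20 KB page
+ // (PL_ALIGN2(20480, 4096) == 20480, PL_ALIGN(20480, 256) == 20480), so a
+ // slab starting from the minimum of 4 pages (80 KB) gets clamped up to
+ // MINIMUM_SLAB_SIZE (256 KB) and trimmed to 12 whole pages, i.e. a 240 KB
+ // slab whose spacemap starts out as the low 12 bits set (with the first
+ // page claimed immediately below).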
+ VkDeviceSize slab_size = slab_pages * pagesize; + pl_static_assert(MINIMUM_SLAB_SIZE <= PAGE_SIZE_ALIGN * MAXIMUM_PAGE_COUNT); + const VkDeviceSize max_slab_size = ma->maximum_page_size * MINIMUM_PAGE_COUNT; + pl_assert(pagesize <= ma->maximum_page_size); + slab_size = PL_CLAMP(slab_size, MINIMUM_SLAB_SIZE, max_slab_size); + slab_pages = slab_size / pagesize; + slab_size = slab_pages * pagesize; // max_slab_size may be npot2, trim excess + + struct vk_malloc_params params = pool->params; + params.reqs.size = slab_size; + + // Don't hold the lock while allocating the slab, because it can be a + // potentially very costly operation. + pl_mutex_unlock(&ma->lock); + slab = slab_alloc(ma, ¶ms); + pl_mutex_lock(&ma->lock); + if (!slab) + return NULL; + pl_mutex_lock(&slab->lock); + + slab->spacemap = (slab_pages == sizeof(uint64_t) * 8) ? ~0LLU : ~(~0LLU << slab_pages); + slab->pagesize = pagesize; + PL_ARRAY_APPEND(NULL, pool->slabs, slab); + + // Return the first page in this newly allocated slab + slab->spacemap ^= 0x1; + *offset = 0; + return slab; +} + +static bool vk_malloc_import(struct vk_malloc *ma, struct vk_memslice *out, + const struct vk_malloc_params *params) +{ + struct vk_ctx *vk = ma->vk; + VkExternalMemoryHandleTypeFlagBitsKHR vk_handle_type; + vk_handle_type = vk_mem_handle_type(params->import_handle); + + struct vk_slab *slab = NULL; + const struct pl_shared_mem *shmem = ¶ms->shared_mem; + + VkMemoryDedicatedAllocateInfoKHR dinfo = { + .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR, + .image = params->ded_image, + }; + + VkImportMemoryFdInfoKHR fdinfo = { + .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR, + .handleType = vk_handle_type, + .fd = -1, + }; + + VkImportMemoryHostPointerInfoEXT ptrinfo = { + .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT, + .handleType = vk_handle_type, + }; + + VkMemoryAllocateInfo ainfo = { + .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, + .allocationSize = shmem->size, + }; + + if (params->ded_image) + vk_link_struct(&ainfo, &dinfo); + + VkBuffer buffer = VK_NULL_HANDLE; + VkMemoryRequirements reqs = params->reqs; + + if (params->buf_usage) { + uint32_t qfs[3] = {0}; + pl_assert(vk->pools.num <= PL_ARRAY_SIZE(qfs)); + for (int i = 0; i < vk->pools.num; i++) + qfs[i] = vk->pools.elem[i]->qf; + + VkExternalMemoryBufferCreateInfoKHR ext_buf_info = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO_KHR, + .handleTypes = vk_handle_type, + }; + + VkBufferCreateInfo binfo = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = &ext_buf_info, + .size = shmem->size, + .usage = params->buf_usage, + .sharingMode = vk->pools.num > 1 ? 
VK_SHARING_MODE_CONCURRENT + : VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = vk->pools.num, + .pQueueFamilyIndices = qfs, + }; + + VK(vk->CreateBuffer(vk->dev, &binfo, PL_VK_ALLOC, &buffer)); + PL_VK_NAME(BUFFER, buffer, "imported"); + + vk->GetBufferMemoryRequirements(vk->dev, buffer, &reqs); + } + + if (reqs.size > shmem->size) { + PL_ERR(vk, "Imported object requires %zu bytes, larger than the " + "provided size %zu!", + (size_t) reqs.size, shmem->size); + goto error; + } + + if (shmem->offset % reqs.alignment || shmem->offset % params->reqs.alignment) { + PL_ERR(vk, "Imported object offset %zu conflicts with alignment %zu!", + shmem->offset, pl_lcm(reqs.alignment, params->reqs.alignment)); + goto error; + } + + switch (params->import_handle) { +#ifdef PL_HAVE_UNIX + case PL_HANDLE_DMA_BUF: { + if (!vk->GetMemoryFdPropertiesKHR) { + PL_ERR(vk, "Importing PL_HANDLE_DMA_BUF requires %s.", + VK_EXT_EXTERNAL_MEMORY_DMA_BUF_EXTENSION_NAME); + goto error; + } + + VkMemoryFdPropertiesKHR fdprops = { + .sType = VK_STRUCTURE_TYPE_MEMORY_FD_PROPERTIES_KHR, + }; + + VK(vk->GetMemoryFdPropertiesKHR(vk->dev, + vk_handle_type, + shmem->handle.fd, + &fdprops)); + + // We dup() the fd to make it safe to import the same original fd + // multiple times. + fdinfo.fd = dup(shmem->handle.fd); + if (fdinfo.fd == -1) { + PL_ERR(vk, "Failed to dup() fd (%d) when importing memory: %s", + fdinfo.fd, strerror(errno)); + goto error; + } + + reqs.memoryTypeBits &= fdprops.memoryTypeBits; + vk_link_struct(&ainfo, &fdinfo); + break; + } +#else // !PL_HAVE_UNIX + case PL_HANDLE_DMA_BUF: + PL_ERR(vk, "PL_HANDLE_DMA_BUF requires building with UNIX support!"); + goto error; +#endif + + case PL_HANDLE_HOST_PTR: { + VkMemoryHostPointerPropertiesEXT ptrprops = { + .sType = VK_STRUCTURE_TYPE_MEMORY_HOST_POINTER_PROPERTIES_EXT, + }; + + VK(vk->GetMemoryHostPointerPropertiesEXT(vk->dev, vk_handle_type, + shmem->handle.ptr, + &ptrprops)); + + ptrinfo.pHostPointer = (void *) shmem->handle.ptr; + reqs.memoryTypeBits &= ptrprops.memoryTypeBits; + vk_link_struct(&ainfo, &ptrinfo); + break; + } + + case PL_HANDLE_FD: + case PL_HANDLE_WIN32: + case PL_HANDLE_WIN32_KMT: + case PL_HANDLE_IOSURFACE: + case PL_HANDLE_MTL_TEX: + PL_ERR(vk, "vk_malloc_import: unsupported handle type %d", + params->import_handle); + goto error; + } + + if (!find_best_memtype(ma, reqs.memoryTypeBits, params, &ainfo.memoryTypeIndex)) { + PL_ERR(vk, "No compatible memory types offered for imported memory!"); + goto error; + } + + VkDeviceMemory vkmem = VK_NULL_HANDLE; + VK(vk->AllocateMemory(vk->dev, &ainfo, PL_VK_ALLOC, &vkmem)); + + slab = pl_alloc_ptr(NULL, slab); + *slab = (struct vk_slab) { + .mem = vkmem, + .dedicated = true, + .imported = true, + .buffer = buffer, + .size = shmem->size, + .handle_type = params->import_handle, + }; + pl_mutex_init(&slab->lock); + + *out = (struct vk_memslice) { + .vkmem = vkmem, + .buf = buffer, + .size = shmem->size - shmem->offset, + .offset = shmem->offset, + .shared_mem = *shmem, + .priv = slab, + }; + + switch (params->import_handle) { + case PL_HANDLE_DMA_BUF: + case PL_HANDLE_FD: + PL_TRACE(vk, "Imported %s bytes from fd: %d%s", + PRINT_SIZE(slab->size), shmem->handle.fd, + params->ded_image ? " (dedicated)" : ""); + // fd ownership is transferred at this point. + slab->handle.fd = fdinfo.fd; + fdinfo.fd = -1; + break; + case PL_HANDLE_HOST_PTR: + PL_TRACE(vk, "Imported %s bytes from ptr: %p%s", + PRINT_SIZE(slab->size), shmem->handle.ptr, + params->ded_image ? 
" (dedicated" : ""); + slab->handle.ptr = ptrinfo.pHostPointer; + break; + case PL_HANDLE_WIN32: + case PL_HANDLE_WIN32_KMT: + case PL_HANDLE_IOSURFACE: + case PL_HANDLE_MTL_TEX: + break; + } + + VkMemoryPropertyFlags flags = ma->props.memoryTypes[ainfo.memoryTypeIndex].propertyFlags; + if (flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) { + VK(vk->MapMemory(vk->dev, slab->mem, 0, VK_WHOLE_SIZE, 0, &slab->data)); + slab->coherent = flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; + out->data = (uint8_t *) slab->data + out->offset; + out->coherent = slab->coherent; + if (!slab->coherent) { + // Use entire buffer range, since this is a dedicated memory + // allocation. This avoids issues with noncoherent atomicity + out->map_offset = 0; + out->map_size = VK_WHOLE_SIZE; + + // Mapping does not implicitly invalidate mapped memory + VK(vk->InvalidateMappedMemoryRanges(vk->dev, 1, &(VkMappedMemoryRange) { + .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE, + .memory = slab->mem, + .offset = out->map_offset, + .size = out->map_size, + })); + } + } + + if (buffer) + VK(vk->BindBufferMemory(vk->dev, buffer, vkmem, 0)); + + return true; + +error: + if (params->debug_tag) + PL_ERR(vk, " for malloc: %s", params->debug_tag); + vk->DestroyBuffer(vk->dev, buffer, PL_VK_ALLOC); +#ifdef PL_HAVE_UNIX + if (fdinfo.fd > -1) + close(fdinfo.fd); +#endif + pl_free(slab); + *out = (struct vk_memslice) {0}; + return false; +} + +size_t vk_malloc_avail(struct vk_malloc *ma, VkMemoryPropertyFlags flags) +{ + size_t avail = 0; + for (int i = 0; i < ma->props.memoryTypeCount; i++) { + const VkMemoryType *mtype = &ma->props.memoryTypes[i]; + if ((mtype->propertyFlags & flags) != flags) + continue; + avail = PL_MAX(avail, ma->props.memoryHeaps[mtype->heapIndex].size); + } + + return avail; +} + +bool vk_malloc_slice(struct vk_malloc *ma, struct vk_memslice *out, + const struct vk_malloc_params *params) +{ + struct vk_ctx *vk = ma->vk; + pl_assert(!params->import_handle || !params->export_handle); + if (params->import_handle) + return vk_malloc_import(ma, out, params); + + pl_assert(params->reqs.size); + size_t size = params->reqs.size; + size_t align = params->reqs.alignment; + align = pl_lcm(align, vk->props.limits.bufferImageGranularity); + align = pl_lcm(align, vk->props.limits.nonCoherentAtomSize); + + struct vk_slab *slab; + VkDeviceSize offset; + + if (params->ded_image || size > ma->maximum_page_size) { + slab = slab_alloc(ma, params); + if (!slab) + return false; + slab->dedicated = true; + offset = 0; + } else { + pl_mutex_lock(&ma->lock); + struct vk_pool *pool = find_pool(ma, params); + slab = pool_get_page(ma, pool, size, align, &offset); + pl_mutex_unlock(&ma->lock); + if (!slab) { + PL_ERR(ma->vk, "No slab to serve request for %s bytes (with " + "alignment 0x%zx) in pool %d!", + PRINT_SIZE(size), align, pool->index); + return false; + } + + // For accounting, just treat the alignment as part of the used size. + // Doing it this way makes sure that the sizes reported to vk_memslice + // consumers are always aligned properly. + size = PL_ALIGN(size, align); + slab->used += size; + slab->age = ma->age; + if (params->debug_tag) + slab->debug_tag = params->debug_tag; + pl_mutex_unlock(&slab->lock); + } + + pl_assert(offset % align == 0); + *out = (struct vk_memslice) { + .vkmem = slab->mem, + .offset = offset, + .size = size, + .buf = slab->buffer, + .data = slab->data ? (uint8_t *) slab->data + offset : 0x0, + .coherent = slab->coherent, + .map_offset = slab->data ? offset : 0, + .map_size = slab->data ? 
size : 0, + .priv = slab, + .shared_mem = { + .handle = slab->handle, + .offset = offset, + .size = slab->size, + }, + }; + return true; +} diff --git a/src/vulkan/malloc.h b/src/vulkan/malloc.h new file mode 100644 index 0000000..115352e --- /dev/null +++ b/src/vulkan/malloc.h @@ -0,0 +1,72 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include "common.h" + +// All memory allocated from a vk_malloc MUST be explicitly released by +// the caller before vk_malloc_destroy is called. +struct vk_malloc *vk_malloc_create(struct vk_ctx *vk); +void vk_malloc_destroy(struct vk_malloc **ma); + +// Get the supported handle types for this malloc instance +pl_handle_caps vk_malloc_handle_caps(const struct vk_malloc *ma, bool import); + +// Represents a single "slice" of generic (non-buffer) memory, plus some +// metadata for accounting. This struct is essentially read-only. +struct vk_memslice { + VkDeviceMemory vkmem; + VkDeviceSize offset; + VkDeviceSize size; + void *priv; + // depending on the type/flags: + struct pl_shared_mem shared_mem; + VkBuffer buf; // associated buffer (when `buf_usage` is nonzero) + void *data; // pointer to slice (for persistently mapped slices) + bool coherent; // whether `data` is coherent + VkDeviceSize map_offset; // can be larger than offset/size + VkDeviceSize map_size; +}; + +struct vk_malloc_params { + VkMemoryRequirements reqs; + VkMemoryPropertyFlags required; + VkMemoryPropertyFlags optimal; + VkBufferUsageFlags buf_usage; + VkImage ded_image; // for dedicated image allocations + enum pl_handle_type export_handle; + enum pl_handle_type import_handle; + struct pl_shared_mem shared_mem; // for `import_handle` + pl_debug_tag debug_tag; +}; + +// Returns the amount of available memory matching a given set of property +// flags. Always returns the highest single allocation, not the combined total. +size_t vk_malloc_avail(struct vk_malloc *ma, VkMemoryPropertyFlags flags); + +bool vk_malloc_slice(struct vk_malloc *ma, struct vk_memslice *out, + const struct vk_malloc_params *params); + +void vk_malloc_free(struct vk_malloc *ma, struct vk_memslice *slice); + +// Clean up unused slabs. Call this roughly once per frame to reduce +// memory pressure / memory leaks. +void vk_malloc_garbage_collect(struct vk_malloc *ma); + +// For debugging purposes. Doesn't include dedicated slab allocations! 
+void vk_malloc_print_stats(struct vk_malloc *ma, enum pl_log_level); diff --git a/src/vulkan/meson.build b/src/vulkan/meson.build new file mode 100644 index 0000000..64c5572 --- /dev/null +++ b/src/vulkan/meson.build @@ -0,0 +1,59 @@ +vulkan_build = get_option('vulkan') +vulkan_link = get_option('vk-proc-addr') +vulkan_loader = dependency('vulkan', required: false) +vulkan_headers = vulkan_loader.partial_dependency(includes: true, compile_args: true) +registry_xml = get_option('vulkan-registry') + +# Prefer our Vulkan headers for portability +vulkan_headers_dir = thirdparty/'Vulkan-Headers' +vulkan_headers_inc = include_directories() +if fs.is_dir(vulkan_headers_dir/'include') + vulkan_headers = declare_dependency() + vulkan_headers_inc = include_directories('../../3rdparty/Vulkan-Headers/include') + # Force the use of this vk.xml because it has to be in sync with the headers + registry_xml = vulkan_headers_dir/'registry/vk.xml' +endif + +vulkan_build = vulkan_build.require( + cc.has_header_symbol('vulkan/vulkan_core.h', 'VK_VERSION_1_3', + include_directories: vulkan_headers_inc, + dependencies: vulkan_headers), + error_message: 'vulkan.h was not found on the system, nor inside ' + + '`3rdparty/Vulkan-Headers`. Please run `git submodule update --init` ' + + 'followed by `meson --wipe`.') +components.set('vulkan', vulkan_build.allowed()) + +vulkan_link = vulkan_link.require(vulkan_loader.found() and vulkan_build.allowed()) +components.set('vk-proc-addr', vulkan_link.allowed()) + +build_deps += vulkan_headers + +if vulkan_build.allowed() + sources += [ + 'vulkan/command.c', + 'vulkan/context.c', + 'vulkan/formats.c', + 'vulkan/gpu.c', + 'vulkan/gpu_buf.c', + 'vulkan/gpu_tex.c', + 'vulkan/gpu_pass.c', + 'vulkan/malloc.c', + 'vulkan/swapchain.c', + 'vulkan/utils.c', + ] + + datadir = get_option('prefix') / get_option('datadir') + sources += custom_target('utils_gen.c', + input: 'utils_gen.py', + output: 'utils_gen.c', + command: [python, '@INPUT@', datadir, registry_xml, '@OUTPUT@'], + env: python_env, + ) + + if vulkan_link.allowed() + build_deps += vulkan_loader + tests += 'vulkan.c' + endif +else + sources += 'vulkan/stubs.c' +endif diff --git a/src/vulkan/stubs.c b/src/vulkan/stubs.c new file mode 100644 index 0000000..0c0738e --- /dev/null +++ b/src/vulkan/stubs.c @@ -0,0 +1,108 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "../common.h" +#include "log.h" + +#include <libplacebo/vulkan.h> + +const struct pl_vk_inst_params pl_vk_inst_default_params = {0}; +const struct pl_vulkan_params pl_vulkan_default_params = { PL_VULKAN_DEFAULTS }; + +pl_vk_inst pl_vk_inst_create(pl_log log, const struct pl_vk_inst_params *params) +{ + pl_fatal(log, "libplacebo compiled without Vulkan support!"); + return NULL; +} + +void pl_vk_inst_destroy(pl_vk_inst *pinst) +{ + pl_vk_inst inst = *pinst; + pl_assert(!inst); +} + +pl_vulkan pl_vulkan_create(pl_log log, const struct pl_vulkan_params *params) +{ + pl_fatal(log, "libplacebo compiled without Vulkan support!"); + return NULL; +} + +void pl_vulkan_destroy(pl_vulkan *pvk) +{ + pl_vulkan vk = *pvk; + pl_assert(!vk); +} + +pl_vulkan pl_vulkan_get(pl_gpu gpu) +{ + return NULL; +} + +VkPhysicalDevice pl_vulkan_choose_device(pl_log log, + const struct pl_vulkan_device_params *params) +{ + pl_err(log, "libplacebo compiled without Vulkan support!"); + return NULL; +} + +pl_swapchain pl_vulkan_create_swapchain(pl_vulkan vk, + const struct pl_vulkan_swapchain_params *params) +{ + pl_unreachable(); +} + +bool pl_vulkan_swapchain_suboptimal(pl_swapchain sw) +{ + pl_unreachable(); +} + +pl_vulkan pl_vulkan_import(pl_log log, const struct pl_vulkan_import_params *params) +{ + pl_fatal(log, "libplacebo compiled without Vulkan support!"); + return NULL; +} + +pl_tex pl_vulkan_wrap(pl_gpu gpu, const struct pl_vulkan_wrap_params *params) +{ + pl_unreachable(); +} + +VkImage pl_vulkan_unwrap(pl_gpu gpu, pl_tex tex, + VkFormat *out_format, VkImageUsageFlags *out_flags) +{ + pl_unreachable(); +} + +bool pl_vulkan_hold_ex(pl_gpu gpu, const struct pl_vulkan_hold_params *params) +{ + pl_unreachable(); +} + +void pl_vulkan_release_ex(pl_gpu gpu, const struct pl_vulkan_release_params *params) +{ + pl_unreachable(); +} + +VkSemaphore pl_vulkan_sem_create(pl_gpu gpu, const struct pl_vulkan_sem_params *params) +{ + pl_unreachable(); +} + +void pl_vulkan_sem_destroy(pl_gpu gpu, VkSemaphore *semaphore) +{ + pl_unreachable(); +} diff --git a/src/vulkan/swapchain.c b/src/vulkan/swapchain.c new file mode 100644 index 0000000..0741fbf --- /dev/null +++ b/src/vulkan/swapchain.c @@ -0,0 +1,911 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "common.h" +#include "command.h" +#include "formats.h" +#include "utils.h" +#include "gpu.h" +#include "swapchain.h" +#include "pl_thread.h" + +struct sem_pair { + VkSemaphore in; + VkSemaphore out; +}; + +struct priv { + struct pl_sw_fns impl; + + pl_mutex lock; + struct vk_ctx *vk; + VkSurfaceKHR surf; + PL_ARRAY(VkSurfaceFormatKHR) formats; + + // current swapchain and metadata: + struct pl_vulkan_swapchain_params params; + VkSwapchainCreateInfoKHR protoInfo; // partially filled-in prototype + VkSwapchainKHR swapchain; + int cur_width, cur_height; + int swapchain_depth; + pl_rc_t frames_in_flight; // number of frames currently queued + bool suboptimal; // true once VK_SUBOPTIMAL_KHR is returned + bool needs_recreate; // swapchain needs to be recreated + struct pl_color_repr color_repr; + struct pl_color_space color_space; + struct pl_hdr_metadata hdr_metadata; + + // state of the images: + PL_ARRAY(pl_tex) images; // pl_tex wrappers for the VkImages + PL_ARRAY(struct sem_pair) sems; // pool of semaphores used to synchronize images + int idx_sems; // index of next free semaphore pair + int last_imgidx; // the image index last acquired (for submit) +}; + +static const struct pl_sw_fns vulkan_swapchain; + +static bool map_color_space(VkColorSpaceKHR space, struct pl_color_space *out) +{ + switch (space) { + // Note: This is technically against the spec, but more often than not + // it's the correct result since `SRGB_NONLINEAR` is just a catch-all + // for any sort of typical SDR curve, which is better approximated by + // `pl_color_space_monitor`. + case VK_COLOR_SPACE_SRGB_NONLINEAR_KHR: + *out = pl_color_space_monitor; + return true; + + case VK_COLOR_SPACE_BT709_NONLINEAR_EXT: + *out = pl_color_space_monitor; + return true; + case VK_COLOR_SPACE_DISPLAY_P3_NONLINEAR_EXT: + *out = (struct pl_color_space) { + .primaries = PL_COLOR_PRIM_DISPLAY_P3, + .transfer = PL_COLOR_TRC_BT_1886, + }; + return true; + case VK_COLOR_SPACE_DCI_P3_LINEAR_EXT: + *out = (struct pl_color_space) { + .primaries = PL_COLOR_PRIM_DCI_P3, + .transfer = PL_COLOR_TRC_LINEAR, + }; + return true; + case VK_COLOR_SPACE_DCI_P3_NONLINEAR_EXT: + *out = (struct pl_color_space) { + .primaries = PL_COLOR_PRIM_DCI_P3, + .transfer = PL_COLOR_TRC_BT_1886, + }; + return true; + case VK_COLOR_SPACE_EXTENDED_SRGB_LINEAR_EXT: + case VK_COLOR_SPACE_EXTENDED_SRGB_NONLINEAR_EXT: + // TODO + return false; + case VK_COLOR_SPACE_BT709_LINEAR_EXT: + *out = (struct pl_color_space) { + .primaries = PL_COLOR_PRIM_DCI_P3, + .transfer = PL_COLOR_TRC_LINEAR, + }; + return true; + case VK_COLOR_SPACE_BT2020_LINEAR_EXT: + *out = (struct pl_color_space) { + .primaries = PL_COLOR_PRIM_BT_2020, + .transfer = PL_COLOR_TRC_LINEAR, + }; + return true; + case VK_COLOR_SPACE_HDR10_ST2084_EXT: + *out = (struct pl_color_space) { + .primaries = PL_COLOR_PRIM_BT_2020, + .transfer = PL_COLOR_TRC_PQ, + }; + return true; + case VK_COLOR_SPACE_DOLBYVISION_EXT: + // Unlikely to ever be implemented + return false; + case VK_COLOR_SPACE_HDR10_HLG_EXT: + *out = (struct pl_color_space) { + .primaries = PL_COLOR_PRIM_BT_2020, + .transfer = PL_COLOR_TRC_HLG, + }; + return true; + case VK_COLOR_SPACE_ADOBERGB_LINEAR_EXT: + *out = (struct pl_color_space) { + .primaries = PL_COLOR_PRIM_ADOBE, + .transfer = PL_COLOR_TRC_LINEAR, + }; + return true; + case VK_COLOR_SPACE_ADOBERGB_NONLINEAR_EXT: + *out = (struct pl_color_space) { + .primaries = PL_COLOR_PRIM_ADOBE, + .transfer = PL_COLOR_TRC_GAMMA22, + }; + return true; + case 
VK_COLOR_SPACE_PASS_THROUGH_EXT: + *out = pl_color_space_unknown; + return true; + +#ifdef VK_AMD_display_native_hdr + case VK_COLOR_SPACE_DISPLAY_NATIVE_AMD: + // TODO + return false; +#endif + + default: return false; + } +} + +static bool pick_surf_format(pl_swapchain sw, const struct pl_color_space *hint) +{ + struct priv *p = PL_PRIV(sw); + struct vk_ctx *vk = p->vk; + pl_gpu gpu = sw->gpu; + + int best_score = 0, best_id; + bool wide_gamut = pl_color_primaries_is_wide_gamut(hint->primaries); + bool prefer_hdr = pl_color_transfer_is_hdr(hint->transfer); + + for (int i = 0; i < p->formats.num; i++) { + // Color space / format whitelist + struct pl_color_space space; + if (!map_color_space(p->formats.elem[i].colorSpace, &space)) + continue; + + bool disable10 = !pl_color_transfer_is_hdr(space.transfer) && + p->params.disable_10bit_sdr; + + switch (p->formats.elem[i].format) { + // Only accept floating point formats for linear curves + case VK_FORMAT_R16G16B16_SFLOAT: + case VK_FORMAT_R16G16B16A16_SFLOAT: + case VK_FORMAT_R32G32B32_SFLOAT: + case VK_FORMAT_R32G32B32A32_SFLOAT: + case VK_FORMAT_R64G64B64_SFLOAT: + case VK_FORMAT_R64G64B64A64_SFLOAT: + if (space.transfer == PL_COLOR_TRC_LINEAR) + break; // accept + continue; + + // Only accept 8 bit for non-HDR curves + case VK_FORMAT_R8G8B8_UNORM: + case VK_FORMAT_B8G8R8_UNORM: + case VK_FORMAT_R8G8B8A8_UNORM: + case VK_FORMAT_B8G8R8A8_UNORM: + case VK_FORMAT_A8B8G8R8_UNORM_PACK32: + if (!pl_color_transfer_is_hdr(space.transfer)) + break; // accept + continue; + + // Only accept 10 bit formats for non-linear curves + case VK_FORMAT_A2R10G10B10_UNORM_PACK32: + case VK_FORMAT_A2B10G10R10_UNORM_PACK32: + if (space.transfer != PL_COLOR_TRC_LINEAR && !disable10) + break; // accept + continue; + + // Accept 16-bit formats for everything + case VK_FORMAT_R16G16B16_UNORM: + case VK_FORMAT_R16G16B16A16_UNORM: + if (!disable10) + break; // accept + continue; + + default: continue; + } + + // Make sure we can wrap this format to a meaningful, valid pl_fmt + for (int n = 0; n < gpu->num_formats; n++) { + pl_fmt plfmt = gpu->formats[n]; + const struct vk_format **pvkfmt = PL_PRIV(plfmt); + if ((*pvkfmt)->tfmt != p->formats.elem[i].format) + continue; + + enum pl_fmt_caps render_caps = 0; + render_caps |= PL_FMT_CAP_RENDERABLE; + render_caps |= PL_FMT_CAP_BLITTABLE; + if ((plfmt->caps & render_caps) != render_caps) + continue; + + // format valid, use it if it has a higher score + int score = 0; + for (int c = 0; c < 3; c++) + score += plfmt->component_depth[c]; + if (pl_color_primaries_is_wide_gamut(space.primaries) == wide_gamut) + score += 1000; + if (space.primaries == hint->primaries) + score += 2000; + if (pl_color_transfer_is_hdr(space.transfer) == prefer_hdr) + score += 10000; + if (space.transfer == hint->transfer) + score += 20000; + + switch (plfmt->type) { + case PL_FMT_UNKNOWN: break; + case PL_FMT_UINT: break; + case PL_FMT_SINT: break; + case PL_FMT_UNORM: score += 500; break; + case PL_FMT_SNORM: score += 400; break; + case PL_FMT_FLOAT: score += 300; break; + case PL_FMT_TYPE_COUNT: pl_unreachable(); + }; + + if (score > best_score) { + best_score = score; + best_id = i; + break; + } + } + } + + if (!best_score) { + PL_ERR(vk, "Failed picking any valid, renderable surface format!"); + return false; + } + + VkSurfaceFormatKHR new_sfmt = p->formats.elem[best_id]; + if (p->protoInfo.imageFormat != new_sfmt.format || + p->protoInfo.imageColorSpace != new_sfmt.colorSpace) + { + PL_INFO(vk, "Picked surface configuration %d: %s + %s", 
best_id, + vk_fmt_name(new_sfmt.format), + vk_csp_name(new_sfmt.colorSpace)); + + p->protoInfo.imageFormat = new_sfmt.format; + p->protoInfo.imageColorSpace = new_sfmt.colorSpace; + p->needs_recreate = true; + } + + return true; +} + +static void set_hdr_metadata(struct priv *p, const struct pl_hdr_metadata *metadata) +{ + struct vk_ctx *vk = p->vk; + if (!vk->SetHdrMetadataEXT) + return; + + // Whitelist only values that we support signalling metadata for + struct pl_hdr_metadata fix = { + .prim = metadata->prim, + .min_luma = metadata->min_luma, + .max_luma = metadata->max_luma, + .max_cll = metadata->max_cll, + .max_fall = metadata->max_fall, + }; + + // Ignore no-op changes + if (pl_hdr_metadata_equal(&fix, &p->hdr_metadata)) + return; + + // Remember the metadata so we can re-apply it after swapchain recreation + p->hdr_metadata = fix; + + // Ignore HDR metadata requests for SDR swapchains + if (!pl_color_transfer_is_hdr(p->color_space.transfer)) + return; + + if (!p->swapchain) + return; + + vk->SetHdrMetadataEXT(vk->dev, 1, &p->swapchain, &(VkHdrMetadataEXT) { + .sType = VK_STRUCTURE_TYPE_HDR_METADATA_EXT, + .displayPrimaryRed = { fix.prim.red.x, fix.prim.red.y }, + .displayPrimaryGreen = { fix.prim.green.x, fix.prim.green.y }, + .displayPrimaryBlue = { fix.prim.blue.x, fix.prim.blue.y }, + .whitePoint = { fix.prim.white.x, fix.prim.white.y }, + .maxLuminance = fix.max_luma, + .minLuminance = fix.min_luma, + .maxContentLightLevel = fix.max_cll, + .maxFrameAverageLightLevel = fix.max_fall, + }); + + // Keep track of applied HDR colorimetry metadata + p->color_space.hdr = p->hdr_metadata; +} + +pl_swapchain pl_vulkan_create_swapchain(pl_vulkan plvk, + const struct pl_vulkan_swapchain_params *params) +{ + struct vk_ctx *vk = PL_PRIV(plvk); + pl_gpu gpu = plvk->gpu; + + if (!vk->CreateSwapchainKHR) { + PL_ERR(gpu, VK_KHR_SWAPCHAIN_EXTENSION_NAME " not enabled!"); + return NULL; + } + + struct pl_swapchain_t *sw = pl_zalloc_obj(NULL, sw, struct priv); + sw->log = vk->log; + sw->gpu = gpu; + + struct priv *p = PL_PRIV(sw); + pl_mutex_init(&p->lock); + p->impl = vulkan_swapchain; + p->params = *params; + p->vk = vk; + p->surf = params->surface; + p->swapchain_depth = PL_DEF(params->swapchain_depth, 3); + pl_assert(p->swapchain_depth > 0); + atomic_init(&p->frames_in_flight, 0); + p->last_imgidx = -1; + p->protoInfo = (VkSwapchainCreateInfoKHR) { + .sType = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR, + .surface = p->surf, + .imageArrayLayers = 1, // non-stereoscopic + .imageSharingMode = VK_SHARING_MODE_EXCLUSIVE, + .minImageCount = p->swapchain_depth + 1, // +1 for the FB + .presentMode = params->present_mode, + .clipped = true, + }; + + // These fields will be updated by `vk_sw_recreate` + p->color_space = pl_color_space_unknown; + p->color_repr = (struct pl_color_repr) { + .sys = PL_COLOR_SYSTEM_RGB, + .levels = PL_COLOR_LEVELS_FULL, + .alpha = PL_ALPHA_UNKNOWN, + }; + + // Make sure the swapchain present mode is supported + VkPresentModeKHR *modes = NULL; + uint32_t num_modes = 0; + VK(vk->GetPhysicalDeviceSurfacePresentModesKHR(vk->physd, p->surf, &num_modes, NULL)); + modes = pl_calloc_ptr(NULL, num_modes, modes); + VK(vk->GetPhysicalDeviceSurfacePresentModesKHR(vk->physd, p->surf, &num_modes, modes)); + + bool supported = false; + for (int i = 0; i < num_modes; i++) + supported |= (modes[i] == p->protoInfo.presentMode); + pl_free_ptr(&modes); + + if (!supported) { + PL_WARN(vk, "Requested swap mode unsupported by this device, falling " + "back to VK_PRESENT_MODE_FIFO_KHR"); + 
p->protoInfo.presentMode = VK_PRESENT_MODE_FIFO_KHR; + } + + // Enumerate the supported surface color spaces + uint32_t num_formats = 0; + VK(vk->GetPhysicalDeviceSurfaceFormatsKHR(vk->physd, p->surf, &num_formats, NULL)); + PL_ARRAY_RESIZE(sw, p->formats, num_formats); + VK(vk->GetPhysicalDeviceSurfaceFormatsKHR(vk->physd, p->surf, &num_formats, p->formats.elem)); + p->formats.num = num_formats; + + PL_INFO(gpu, "Available surface configurations:"); + for (int i = 0; i < p->formats.num; i++) { + PL_INFO(gpu, " %d: %-40s %s", i, + vk_fmt_name(p->formats.elem[i].format), + vk_csp_name(p->formats.elem[i].colorSpace)); + } + + // Ensure there exists at least some valid renderable surface format + struct pl_color_space hint = {0}; + if (!pick_surf_format(sw, &hint)) + goto error; + + return sw; + +error: + pl_free(modes); + pl_free(sw); + return NULL; +} + +static void vk_sw_destroy(pl_swapchain sw) +{ + pl_gpu gpu = sw->gpu; + struct priv *p = PL_PRIV(sw); + struct vk_ctx *vk = p->vk; + + pl_gpu_flush(gpu); + vk_wait_idle(vk); + + // Vulkan offers no way to know when a queue presentation command is done, + // leading to spec-mandated undefined behavior when destroying resources + // tied to the swapchain. Use an extra `vkQueueWaitIdle` on all of the + // queues we may have oustanding presentation calls on, to hopefully inform + // the driver that we want to wait until the device is truly idle. + for (int i = 0; i < vk->pool_graphics->num_queues; i++) + vk->QueueWaitIdle(vk->pool_graphics->queues[i]); + + for (int i = 0; i < p->images.num; i++) + pl_tex_destroy(gpu, &p->images.elem[i]); + for (int i = 0; i < p->sems.num; i++) { + vk->DestroySemaphore(vk->dev, p->sems.elem[i].in, PL_VK_ALLOC); + vk->DestroySemaphore(vk->dev, p->sems.elem[i].out, PL_VK_ALLOC); + } + + vk->DestroySwapchainKHR(vk->dev, p->swapchain, PL_VK_ALLOC); + pl_mutex_destroy(&p->lock); + pl_free((void *) sw); +} + +static int vk_sw_latency(pl_swapchain sw) +{ + struct priv *p = PL_PRIV(sw); + return p->swapchain_depth; +} + +static bool update_swapchain_info(struct priv *p, VkSwapchainCreateInfoKHR *info, + int w, int h) +{ + struct vk_ctx *vk = p->vk; + + // Query the supported capabilities and update this struct as needed + VkSurfaceCapabilitiesKHR caps = {0}; + VK(vk->GetPhysicalDeviceSurfaceCapabilitiesKHR(vk->physd, p->surf, &caps)); + + // Check for hidden/invisible window + if (!caps.currentExtent.width || !caps.currentExtent.height) { + PL_DEBUG(vk, "maxImageExtent reported as 0x0, hidden window? 
skipping"); + return false; + } + + // Sorted by preference + static const struct { VkCompositeAlphaFlagsKHR vk_mode; + enum pl_alpha_mode pl_mode; + } alphaModes[] = { + {VK_COMPOSITE_ALPHA_POST_MULTIPLIED_BIT_KHR, PL_ALPHA_INDEPENDENT}, + {VK_COMPOSITE_ALPHA_PRE_MULTIPLIED_BIT_KHR, PL_ALPHA_PREMULTIPLIED}, + {VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR, PL_ALPHA_UNKNOWN}, + {VK_COMPOSITE_ALPHA_INHERIT_BIT_KHR, PL_ALPHA_UNKNOWN}, + }; + + for (int i = 0; i < PL_ARRAY_SIZE(alphaModes); i++) { + if (caps.supportedCompositeAlpha & alphaModes[i].vk_mode) { + info->compositeAlpha = alphaModes[i].vk_mode; + p->color_repr.alpha = alphaModes[i].pl_mode; + PL_DEBUG(vk, "Requested alpha compositing mode: %s", + vk_alpha_mode(info->compositeAlpha)); + break; + } + } + + if (!info->compositeAlpha) { + PL_ERR(vk, "Failed picking alpha compositing mode (caps: 0x%x)", + caps.supportedCompositeAlpha); + goto error; + } + + // Note: We could probably also allow picking a surface transform that + // flips the framebuffer and set `pl_swapchain_frame.flipped`, but this + // doesn't appear to be necessary for any vulkan implementations. + static const VkSurfaceTransformFlagsKHR rotModes[] = { + VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR, + VK_SURFACE_TRANSFORM_INHERIT_BIT_KHR, + }; + + for (int i = 0; i < PL_ARRAY_SIZE(rotModes); i++) { + if (caps.supportedTransforms & rotModes[i]) { + info->preTransform = rotModes[i]; + PL_DEBUG(vk, "Requested surface transform: %s", + vk_surface_transform(info->preTransform)); + break; + } + } + + if (!info->preTransform) { + PL_ERR(vk, "Failed picking surface transform mode (caps: 0x%x)", + caps.supportedTransforms); + goto error; + } + + // Image count as required + PL_DEBUG(vk, "Requested image count: %d (min %d max %d)", + (int) info->minImageCount, (int) caps.minImageCount, + (int) caps.maxImageCount); + + info->minImageCount = PL_MAX(info->minImageCount, caps.minImageCount); + if (caps.maxImageCount) + info->minImageCount = PL_MIN(info->minImageCount, caps.maxImageCount); + + PL_DEBUG(vk, "Requested image size: %dx%d (min %dx%d < cur %dx%d < max %dx%d)", + w, h, caps.minImageExtent.width, caps.minImageExtent.height, + caps.currentExtent.width, caps.currentExtent.height, + caps.maxImageExtent.width, caps.maxImageExtent.height); + + // Default the requested size based on the reported extent + if (caps.currentExtent.width != 0xFFFFFFFF) + w = PL_DEF(w, caps.currentExtent.width); + if (caps.currentExtent.height != 0xFFFFFFFF) + h = PL_DEF(h, caps.currentExtent.height); + + // Otherwise, re-use the existing size if available + w = PL_DEF(w, info->imageExtent.width); + h = PL_DEF(h, info->imageExtent.height); + + if (!w || !h) { + PL_ERR(vk, "Failed resizing swapchain: unknown size?"); + goto error; + } + + // Clamp the extent based on the supported limits + w = PL_CLAMP(w, caps.minImageExtent.width, caps.maxImageExtent.width); + h = PL_CLAMP(h, caps.minImageExtent.height, caps.maxImageExtent.height); + info->imageExtent = (VkExtent2D) { w, h }; + + // We just request whatever makes sense, and let the pl_vk decide what + // pl_tex_params that translates to. 
That said, we still need to intersect + // the swapchain usage flags with the format usage flags + VkImageUsageFlags req_flags = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | + VK_IMAGE_USAGE_TRANSFER_DST_BIT; + VkImageUsageFlags opt_flags = VK_IMAGE_USAGE_STORAGE_BIT; + + info->imageUsage = caps.supportedUsageFlags & (req_flags | opt_flags); + VkFormatProperties fmtprop = {0}; + vk->GetPhysicalDeviceFormatProperties(vk->physd, info->imageFormat, &fmtprop); + +#define CHECK(usage, feature) \ + if (!((fmtprop.optimalTilingFeatures & VK_FORMAT_FEATURE_##feature##_BIT))) \ + info->imageUsage &= ~VK_IMAGE_USAGE_##usage##_BIT + + CHECK(COLOR_ATTACHMENT, COLOR_ATTACHMENT); + CHECK(TRANSFER_DST, TRANSFER_DST); + CHECK(STORAGE, STORAGE_IMAGE); + + if ((info->imageUsage & req_flags) != req_flags) { + PL_ERR(vk, "The swapchain doesn't support rendering and blitting!"); + goto error; + } + + return true; + +error: + return false; +} + +static void destroy_swapchain(struct vk_ctx *vk, void *swapchain) +{ + vk->DestroySwapchainKHR(vk->dev, vk_unwrap_handle(swapchain), PL_VK_ALLOC); +} + +static bool vk_sw_recreate(pl_swapchain sw, int w, int h) +{ + pl_gpu gpu = sw->gpu; + struct priv *p = PL_PRIV(sw); + struct vk_ctx *vk = p->vk; + + VkImage *vkimages = NULL; + uint32_t num_images = 0; + + if (!update_swapchain_info(p, &p->protoInfo, w, h)) + return false; + + VkSwapchainCreateInfoKHR sinfo = p->protoInfo; +#ifdef VK_EXT_full_screen_exclusive + // Explicitly disallow full screen exclusive mode if possible + static const VkSurfaceFullScreenExclusiveInfoEXT fsinfo = { + .sType = VK_STRUCTURE_TYPE_SURFACE_FULL_SCREEN_EXCLUSIVE_INFO_EXT, + .fullScreenExclusive = VK_FULL_SCREEN_EXCLUSIVE_DISALLOWED_EXT, + }; + if (vk->AcquireFullScreenExclusiveModeEXT) + vk_link_struct(&sinfo, &fsinfo); +#endif + + p->suboptimal = false; + p->needs_recreate = false; + p->cur_width = sinfo.imageExtent.width; + p->cur_height = sinfo.imageExtent.height; + + PL_DEBUG(sw, "(Re)creating swapchain of size %dx%d", + sinfo.imageExtent.width, + sinfo.imageExtent.height); + +#ifdef PL_HAVE_UNIX + if (vk->props.vendorID == VK_VENDOR_ID_NVIDIA) { + vk->DeviceWaitIdle(vk->dev); + vk_wait_idle(vk); + } +#endif + + // Calling `vkCreateSwapchainKHR` puts sinfo.oldSwapchain into a retired + // state whether the call succeeds or not, so we always need to garbage + // collect it afterwards - asynchronously as it may still be in use + sinfo.oldSwapchain = p->swapchain; + p->swapchain = VK_NULL_HANDLE; + VkResult res = vk->CreateSwapchainKHR(vk->dev, &sinfo, PL_VK_ALLOC, &p->swapchain); + vk_dev_callback(vk, (vk_cb) destroy_swapchain, vk, vk_wrap_handle(sinfo.oldSwapchain)); + PL_VK_ASSERT(res, "vk->CreateSwapchainKHR(...)"); + + // Get the new swapchain images + VK(vk->GetSwapchainImagesKHR(vk->dev, p->swapchain, &num_images, NULL)); + vkimages = pl_calloc_ptr(NULL, num_images, vkimages); + VK(vk->GetSwapchainImagesKHR(vk->dev, p->swapchain, &num_images, vkimages)); + + for (int i = 0; i < num_images; i++) + PL_VK_NAME(IMAGE, vkimages[i], "swapchain"); + + // If needed, allocate some more semaphores + while (num_images > p->sems.num) { + VkSemaphore sem_in = VK_NULL_HANDLE, sem_out = VK_NULL_HANDLE; + static const VkSemaphoreCreateInfo seminfo = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, + }; + VK(vk->CreateSemaphore(vk->dev, &seminfo, PL_VK_ALLOC, &sem_in)); + VK(vk->CreateSemaphore(vk->dev, &seminfo, PL_VK_ALLOC, &sem_out)); + PL_VK_NAME(SEMAPHORE, sem_in, "swapchain in"); + PL_VK_NAME(SEMAPHORE, sem_out, "swapchain out"); + + 
PL_ARRAY_APPEND(sw, p->sems, (struct sem_pair) { + .in = sem_in, + .out = sem_out, + }); + } + + // Recreate the pl_tex wrappers + for (int i = 0; i < p->images.num; i++) + pl_tex_destroy(gpu, &p->images.elem[i]); + p->images.num = 0; + + for (int i = 0; i < num_images; i++) { + const VkExtent2D *ext = &sinfo.imageExtent; + pl_tex tex = pl_vulkan_wrap(gpu, pl_vulkan_wrap_params( + .image = vkimages[i], + .width = ext->width, + .height = ext->height, + .format = sinfo.imageFormat, + .usage = sinfo.imageUsage, + )); + if (!tex) + goto error; + PL_ARRAY_APPEND(sw, p->images, tex); + } + + pl_assert(num_images > 0); + int bits = 0; + + // The channel with the most bits is probably the most authoritative about + // the actual color information (consider e.g. a2bgr10). Slight downside + // in that it results in rounding r/b for e.g. rgb565, but we don't pick + // surfaces with fewer than 8 bits anyway, so let's not care for now. + pl_fmt fmt = p->images.elem[0]->params.format; + for (int i = 0; i < fmt->num_components; i++) + bits = PL_MAX(bits, fmt->component_depth[i]); + + p->color_repr.bits.sample_depth = bits; + p->color_repr.bits.color_depth = bits; + + // Note: `p->color_space.hdr` is (re-)applied by `set_hdr_metadata` + map_color_space(sinfo.imageColorSpace, &p->color_space); + + // Forcibly re-apply HDR metadata, bypassing the no-op check + struct pl_hdr_metadata metadata = p->hdr_metadata; + p->hdr_metadata = pl_hdr_metadata_empty; + set_hdr_metadata(p, &metadata); + + pl_free(vkimages); + return true; + +error: + PL_ERR(vk, "Failed (re)creating swapchain!"); + pl_free(vkimages); + vk->DestroySwapchainKHR(vk->dev, p->swapchain, PL_VK_ALLOC); + p->swapchain = VK_NULL_HANDLE; + p->cur_width = p->cur_height = 0; + return false; +} + +static bool vk_sw_start_frame(pl_swapchain sw, + struct pl_swapchain_frame *out_frame) +{ + struct priv *p = PL_PRIV(sw); + struct vk_ctx *vk = p->vk; + pl_mutex_lock(&p->lock); + + bool recreate = !p->swapchain || p->needs_recreate; + if (p->suboptimal && !p->params.allow_suboptimal) + recreate = true; + + if (recreate && !vk_sw_recreate(sw, 0, 0)) { + pl_mutex_unlock(&p->lock); + return false; + } + + VkSemaphore sem_in = p->sems.elem[p->idx_sems].in; + PL_TRACE(vk, "vkAcquireNextImageKHR signals 0x%"PRIx64, (uint64_t) sem_in); + + for (int attempts = 0; attempts < 2; attempts++) { + uint32_t imgidx = 0; + VkResult res = vk->AcquireNextImageKHR(vk->dev, p->swapchain, UINT64_MAX, + sem_in, VK_NULL_HANDLE, &imgidx); + + switch (res) { + case VK_SUBOPTIMAL_KHR: + p->suboptimal = true; + // fall through + case VK_SUCCESS: + p->last_imgidx = imgidx; + pl_vulkan_release_ex(sw->gpu, pl_vulkan_release_params( + .tex = p->images.elem[imgidx], + .layout = VK_IMAGE_LAYOUT_UNDEFINED, + .qf = VK_QUEUE_FAMILY_IGNORED, + .semaphore = { sem_in }, + )); + *out_frame = (struct pl_swapchain_frame) { + .fbo = p->images.elem[imgidx], + .flipped = false, + .color_repr = p->color_repr, + .color_space = p->color_space, + }; + // keep lock held + return true; + + case VK_ERROR_OUT_OF_DATE_KHR: { + // In these cases try recreating the swapchain + if (!vk_sw_recreate(sw, 0, 0)) { + pl_mutex_unlock(&p->lock); + return false; + } + continue; + } + + default: + PL_ERR(vk, "Failed acquiring swapchain image: %s", vk_res_str(res)); + pl_mutex_unlock(&p->lock); + return false; + } + } + + // If we've exhausted the number of attempts to recreate the swapchain, + // just give up silently and let the user retry some time later. 
+ pl_mutex_unlock(&p->lock); + return false; +} + +static void present_cb(struct priv *p, void *arg) +{ + (void) pl_rc_deref(&p->frames_in_flight); +} + +static bool vk_sw_submit_frame(pl_swapchain sw) +{ + pl_gpu gpu = sw->gpu; + struct priv *p = PL_PRIV(sw); + struct vk_ctx *vk = p->vk; + pl_assert(p->last_imgidx >= 0); + pl_assert(p->swapchain); + uint32_t idx = p->last_imgidx; + VkSemaphore sem_out = p->sems.elem[p->idx_sems++].out; + p->idx_sems %= p->sems.num; + p->last_imgidx = -1; + + bool held = pl_vulkan_hold_ex(gpu, pl_vulkan_hold_params( + .tex = p->images.elem[idx], + .layout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR, + .qf = VK_QUEUE_FAMILY_IGNORED, + .semaphore = { sem_out }, + )); + + if (!held) { + PL_ERR(gpu, "Failed holding swapchain image for presentation"); + pl_mutex_unlock(&p->lock); + return false; + } + + struct vk_cmd *cmd = pl_vk_steal_cmd(gpu); + if (!cmd) { + pl_mutex_unlock(&p->lock); + return false; + } + + pl_rc_ref(&p->frames_in_flight); + vk_cmd_callback(cmd, (vk_cb) present_cb, p, NULL); + if (!vk_cmd_submit(&cmd)) { + pl_mutex_unlock(&p->lock); + return false; + } + + struct vk_cmdpool *pool = vk->pool_graphics; + int qidx = pool->idx_queues; + VkQueue queue = pool->queues[qidx]; + + vk_rotate_queues(p->vk); + vk_malloc_garbage_collect(vk->ma); + + VkPresentInfoKHR pinfo = { + .sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR, + .waitSemaphoreCount = 1, + .pWaitSemaphores = &sem_out, + .swapchainCount = 1, + .pSwapchains = &p->swapchain, + .pImageIndices = &idx, + }; + + PL_TRACE(vk, "vkQueuePresentKHR waits on 0x%"PRIx64, (uint64_t) sem_out); + vk->lock_queue(vk->queue_ctx, pool->qf, qidx); + VkResult res = vk->QueuePresentKHR(queue, &pinfo); + vk->unlock_queue(vk->queue_ctx, pool->qf, qidx); + pl_mutex_unlock(&p->lock); + + switch (res) { + case VK_SUBOPTIMAL_KHR: + p->suboptimal = true; + // fall through + case VK_SUCCESS: + return true; + + case VK_ERROR_OUT_OF_DATE_KHR: + // We can silently ignore this error, since the next start_frame will + // recreate the swapchain automatically. 
+ return true; + + default: + PL_ERR(vk, "Failed presenting to queue %p: %s", (void *) queue, + vk_res_str(res)); + return false; + } +} + +static void vk_sw_swap_buffers(pl_swapchain sw) +{ + struct priv *p = PL_PRIV(sw); + + pl_mutex_lock(&p->lock); + while (pl_rc_count(&p->frames_in_flight) >= p->swapchain_depth) { + pl_mutex_unlock(&p->lock); // don't hold mutex while blocking + vk_poll_commands(p->vk, UINT64_MAX); + pl_mutex_lock(&p->lock); + } + pl_mutex_unlock(&p->lock); +} + +static bool vk_sw_resize(pl_swapchain sw, int *width, int *height) +{ + struct priv *p = PL_PRIV(sw); + bool ok = true; + + pl_mutex_lock(&p->lock); + + bool width_changed = *width && *width != p->cur_width, + height_changed = *height && *height != p->cur_height; + + if (p->suboptimal || p->needs_recreate || width_changed || height_changed) + ok = vk_sw_recreate(sw, *width, *height); + + *width = p->cur_width; + *height = p->cur_height; + + pl_mutex_unlock(&p->lock); + return ok; +} + +static void vk_sw_colorspace_hint(pl_swapchain sw, const struct pl_color_space *csp) +{ + struct priv *p = PL_PRIV(sw); + pl_mutex_lock(&p->lock); + + // This should never fail if the swapchain already exists + bool ok = pick_surf_format(sw, csp); + set_hdr_metadata(p, &csp->hdr); + pl_assert(ok); + + pl_mutex_unlock(&p->lock); +} + +bool pl_vulkan_swapchain_suboptimal(pl_swapchain sw) +{ + struct priv *p = PL_PRIV(sw); + return p->suboptimal; +} + +static const struct pl_sw_fns vulkan_swapchain = { + .destroy = vk_sw_destroy, + .latency = vk_sw_latency, + .resize = vk_sw_resize, + .colorspace_hint = vk_sw_colorspace_hint, + .start_frame = vk_sw_start_frame, + .submit_frame = vk_sw_submit_frame, + .swap_buffers = vk_sw_swap_buffers, +}; diff --git a/src/vulkan/utils.c b/src/vulkan/utils.c new file mode 100644 index 0000000..914f9e4 --- /dev/null +++ b/src/vulkan/utils.c @@ -0,0 +1,181 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "utils.h" + +VkExternalMemoryHandleTypeFlagBitsKHR +vk_mem_handle_type(enum pl_handle_type handle_type) +{ + if (!handle_type) + return 0; + + switch (handle_type) { + case PL_HANDLE_FD: + return VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR; + case PL_HANDLE_WIN32: + return VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR; + case PL_HANDLE_WIN32_KMT: + return VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT_KHR; + case PL_HANDLE_DMA_BUF: + return VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT; + case PL_HANDLE_HOST_PTR: + return VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT; + case PL_HANDLE_MTL_TEX: + case PL_HANDLE_IOSURFACE: + return 0; + } + + pl_unreachable(); +} + +VkExternalSemaphoreHandleTypeFlagBitsKHR +vk_sync_handle_type(enum pl_handle_type handle_type) +{ + if (!handle_type) + return 0; + + switch (handle_type) { + case PL_HANDLE_FD: + return VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR; + case PL_HANDLE_WIN32: + return VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR; + case PL_HANDLE_WIN32_KMT: + return VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT_KHR; + case PL_HANDLE_DMA_BUF: + case PL_HANDLE_HOST_PTR: + case PL_HANDLE_MTL_TEX: + case PL_HANDLE_IOSURFACE: + return 0; + } + + pl_unreachable(); +} + +bool vk_external_mem_check(struct vk_ctx *vk, + const VkExternalMemoryPropertiesKHR *props, + enum pl_handle_type handle_type, + bool import) +{ + VkExternalMemoryFeatureFlagsKHR flags = props->externalMemoryFeatures; + VkExternalMemoryHandleTypeFlagBitsKHR vk_handle = vk_mem_handle_type(handle_type); + + if (import) { + if (!(flags & VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT_KHR)) { + PL_DEBUG(vk, "Handle type %s (0x%x) is not importable", + vk_handle_name(vk_handle), (unsigned int) handle_type); + return false; + } + } else { + if (!(flags & VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT_KHR)) { + PL_DEBUG(vk, "Handle type %s (0x%x) is not exportable", + vk_handle_name(vk_handle), (unsigned int) handle_type); + return false; + } + } + + return true; +} + +const enum pl_handle_type vk_mem_handle_list[] = { + PL_HANDLE_HOST_PTR, +#ifdef PL_HAVE_UNIX + PL_HANDLE_FD, + PL_HANDLE_DMA_BUF, +#endif +#ifdef PL_HAVE_WIN32 + PL_HANDLE_WIN32, + PL_HANDLE_WIN32_KMT, +#endif + 0 +}; + +const enum pl_handle_type vk_sync_handle_list[] = { +#ifdef PL_HAVE_UNIX + PL_HANDLE_FD, +#endif +#ifdef PL_HAVE_WIN32 + PL_HANDLE_WIN32, + PL_HANDLE_WIN32_KMT, +#endif + 0 +}; + +const void *vk_find_struct(const void *chain, VkStructureType stype) +{ + const VkBaseInStructure *in = chain; + while (in) { + if (in->sType == stype) + return in; + + in = in->pNext; + } + + return NULL; +} + +void vk_link_struct(void *chain, const void *in) +{ + if (!in) + return; + + VkBaseOutStructure *out = chain; + while (out->pNext) + out = out->pNext; + + out->pNext = (void *) in; +} + +void *vk_struct_memdup(void *alloc, const void *pin) +{ + if (!pin) + return NULL; + + const VkBaseInStructure *in = pin; + size_t size = vk_struct_size(in->sType); + pl_assert(size); + + VkBaseOutStructure *out = pl_memdup(alloc, in, size); + out->pNext = NULL; + return out; +} + +void *vk_chain_memdup(void *alloc, const void *pin) +{ + if (!pin) + return NULL; + + const VkBaseInStructure *in = pin; + VkBaseOutStructure *out = vk_struct_memdup(alloc, in); + pl_assert(out); + + out->pNext = vk_chain_memdup(alloc, in->pNext); + return out; +} + +void *vk_chain_alloc(void *alloc, void *chain, VkStructureType stype) +{ + for (VkBaseOutStructure *out = chain;; out = out->pNext) { + if 
(out->sType == stype) + return out; + if (!out->pNext) { + VkBaseOutStructure *s = pl_zalloc(alloc, vk_struct_size(stype)); + s->sType = stype; + out->pNext = s; + return s; + } + } +} diff --git a/src/vulkan/utils.h b/src/vulkan/utils.h new file mode 100644 index 0000000..cb1c5f5 --- /dev/null +++ b/src/vulkan/utils.h @@ -0,0 +1,136 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include "common.h" + +// Return a human-readable name for various vulkan enums +const char *vk_res_str(VkResult res); +const char *vk_fmt_name(VkFormat fmt); +const char *vk_csp_name(VkColorSpaceKHR csp); +const char *vk_handle_name(VkExternalMemoryHandleTypeFlagBitsKHR handle); +const char *vk_obj_type(VkObjectType obj); +const char *vk_alpha_mode(VkCompositeAlphaFlagsKHR alpha); +const char *vk_surface_transform(VkSurfaceTransformFlagsKHR transform); + +// Return the size of an arbitrary vulkan struct. Returns 0 for unknown structs +size_t vk_struct_size(VkStructureType stype); + +// Returns the vulkan API version which a given extension was promoted to, or 0 +// if the extension is not promoted. +uint32_t vk_ext_promoted_ver(const char *extension); + +// Enum translation boilerplate +VkExternalMemoryHandleTypeFlagBitsKHR vk_mem_handle_type(enum pl_handle_type); +VkExternalSemaphoreHandleTypeFlagBitsKHR vk_sync_handle_type(enum pl_handle_type); + +// Bitmask of all access flags that imply a read/write operation, respectively +extern const VkAccessFlags2 vk_access_read; +extern const VkAccessFlags2 vk_access_write; + +// Check for compatibility of a VkExternalMemoryProperties +bool vk_external_mem_check(struct vk_ctx *vk, + const VkExternalMemoryPropertiesKHR *props, + enum pl_handle_type handle_type, + bool check_import); + +// Static lists of external handle types we should try probing for +extern const enum pl_handle_type vk_mem_handle_list[]; +extern const enum pl_handle_type vk_sync_handle_list[]; + +// Find a structure in a pNext chain, or NULL +const void *vk_find_struct(const void *chain, VkStructureType stype); + +// Link a structure into a pNext chain +void vk_link_struct(void *chain, const void *in); + +// Make a copy of a structure, not including the pNext chain +void *vk_struct_memdup(void *alloc, const void *in); + +// Make a deep copy of an entire pNext chain +void *vk_chain_memdup(void *alloc, const void *in); + +// Find a structure in a pNext chain, or allocate + link it if absent. +void *vk_chain_alloc(void *alloc, void *chain, VkStructureType stype); + +// Renormalize input features into a state consistent for a given API version. +// If `api_ver` is specified as 0, *both* meta-structs and extension structs +// will be emitted. Note: `out` should be initialized by the user. In +// particular, if it already contains a valid features chain, then this +// function will effectively act as a union. 
+void vk_features_normalize(void *alloc, const VkPhysicalDeviceFeatures2 *in, + uint32_t api_ver, VkPhysicalDeviceFeatures2 *out); + +// Convenience macros to simplify a lot of common boilerplate +#define PL_VK_ASSERT(res, str) \ + do { \ + if (res != VK_SUCCESS) { \ + PL_ERR(vk, str ": %s (%s:%d)", \ + vk_res_str(res), __FILE__, __LINE__); \ + goto error; \ + } \ + } while (0) + +#define VK(cmd) \ + do { \ + PL_TRACE(vk, #cmd); \ + VkResult _res = (cmd); \ + PL_VK_ASSERT(_res, #cmd); \ + } while (0) + +#define PL_VK_NAME(type, obj, name) \ + do { \ + if (vk->SetDebugUtilsObjectNameEXT) { \ + vk->SetDebugUtilsObjectNameEXT(vk->dev, &(VkDebugUtilsObjectNameInfoEXT) { \ + .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_OBJECT_NAME_INFO_EXT, \ + .objectType = VK_OBJECT_TYPE_##type, \ + .objectHandle = (uint64_t) (obj), \ + .pObjectName = (name), \ + }); \ + } \ + } while (0) + +// Variant of PL_VK_NAME for dispatchable handles +#define PL_VK_NAME_HANDLE(type, obj, name) \ + PL_VK_NAME(type, (uintptr_t) (obj), name) + +// Helper functions to wrap and unwrap non-dispatchable handles into pointers. +// Note that wrap/unwrap must always be used linearly. +#if VK_USE_64_BIT_PTR_DEFINES == 1 +#define vk_wrap_handle(h) (h) +#define vk_unwrap_handle(h) (h) +#elif UINTPTR_MAX >= UINT64_MAX +#define vk_wrap_handle(h) ((void *) (uintptr_t) (h)) +#define vk_unwrap_handle(h) ((uint64_t) (uintptr_t) (h)) +#else +static inline void *vk_wrap_handle(uint64_t h) +{ + uint64_t *wrapper = malloc(sizeof(h)); + assert(wrapper); + *wrapper = h; + return wrapper; +} + +static inline uint64_t vk_unwrap_handle(void *h) +{ + uint64_t *wrapper = h; + uint64_t ret = *wrapper; + free(wrapper); + return ret; +} +#endif diff --git a/src/vulkan/utils_gen.c.j2 b/src/vulkan/utils_gen.c.j2 new file mode 100644 index 0000000..6db0454 --- /dev/null +++ b/src/vulkan/utils_gen.c.j2 @@ -0,0 +1,137 @@ +#define VK_ENABLE_BETA_EXTENSIONS +#include "vulkan/utils.h" + +const char *vk_res_str(VkResult res) +{ + switch (res) { +{% for res in vkresults %} + case {{ res }}: return "{{ res }}"; +{% endfor %} + + default: return "unknown error"; + } +} + +const char *vk_fmt_name(VkFormat fmt) +{ + switch (fmt) { +{% for fmt in vkformats %} + case {{ fmt }}: return "{{ fmt }}"; +{% endfor %} + + default: return "unknown format"; + } +} + +const char *vk_csp_name(VkColorSpaceKHR csp) +{ + switch (csp) { +{% for csp in vkspaces %} + case {{ csp }}: return "{{ csp }}"; +{% endfor %} + + default: return "unknown color space"; + } +} + +const char *vk_handle_name(VkExternalMemoryHandleTypeFlagBitsKHR handle) +{ + switch (handle) { +{% for handle in vkhandles %} + case {{ handle }}: return "{{ handle }}"; +{% endfor %} + + default: return "unknown handle type"; + } +} + +const char *vk_alpha_mode(VkCompositeAlphaFlagsKHR alpha) +{ + switch (alpha) { +{% for mode in vkalphas %} + case {{ mode }}: return "{{ mode }}"; +{% endfor %} + + default: return "unknown alpha mode"; + } +} + +const char *vk_surface_transform(VkSurfaceTransformFlagsKHR tf) +{ + switch (tf) { +{% for tf in vktransforms %} + case {{ tf }}: return "{{ tf }}"; +{% endfor %} + + default: return "unknown surface transform"; + } +} + + +const char *vk_obj_type(VkObjectType obj) +{ + switch (obj) { +{% for obj in vkobjects %} + case {{ obj.enum }}: return "{{ obj.name }}"; +{% endfor %} + + default: return "unknown object"; + } +} + +size_t vk_struct_size(VkStructureType stype) +{ + switch (stype) { +{% for struct in vkstructs %} + case {{ struct.stype }}: return sizeof({{ struct.name }}); +{% 
endfor %} + + default: return 0; + } +} + +uint32_t vk_ext_promoted_ver(const char *extension) +{ +{% for ext in vkexts %} +{% if ext.promoted_ver %} + if (!strcmp(extension, "{{ ext.name }}")) + return {{ ext.promoted_ver }}; +{% endif %} +{% endfor %} + return 0; +} + +void vk_features_normalize(void *alloc, const VkPhysicalDeviceFeatures2 *fin, + uint32_t api_ver, VkPhysicalDeviceFeatures2 *out) +{ + for (const VkBaseInStructure *in = (void *) fin; in; in = in->pNext) { + switch (in->sType) { + default: break; +{% for fs in vkfeatures %} + case {{ fs.stype }}: { + const {{ fs.name }} *i = (const void *) in; +{% for f in fs.features %} + if (i->{{ f.name }}) { +{% for r in f.replacements %} +{% if r.core_ver %} + if (!api_ver || api_ver >= {{ r.core_ver }}) +{% elif r.max_ver %} + if (!api_ver || api_ver < {{ r.max_ver }}) +{% endif %} +{% if fs.is_base %} + out->{{ f.name }} = true; +{% else %} + (({{ r.name }} *) vk_chain_alloc(alloc, out, {{ r.stype }}))->{{ f.name }} = true; +{% endif %} +{% endfor %} + } +{% endfor %} + break; + } +{% endfor %} + } + } +} + +const VkAccessFlags2 vk_access_read = {{ '0x%x' % vkaccess.read }}LLU; +const VkAccessFlags2 vk_access_write = {{ '0x%x' % vkaccess.write }}LLU; diff --git a/src/vulkan/utils_gen.py b/src/vulkan/utils_gen.py new file mode 100644 index 0000000..a8652fd --- /dev/null +++ b/src/vulkan/utils_gen.py @@ -0,0 +1,219 @@ +#!/usr/bin/env python3 +# +# This file is part of libplacebo. +# +# libplacebo is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# libplacebo is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + +import os.path +import re +import sys +import xml.etree.ElementTree as ET + +try: + import jinja2 +except ModuleNotFoundError: + print('Module \'jinja2\' not found, please install \'python3-Jinja2\' or ' + 'an equivalent package on your system! 
Alternatively, run ' + '`git submodule update --init` followed by `meson --wipe`.', + file=sys.stderr) + sys.exit(1) + +TEMPLATE = jinja2.Environment( + loader = jinja2.FileSystemLoader(searchpath=os.path.dirname(__file__)), + trim_blocks=True, +).get_template('utils_gen.c.j2') + +class Obj(object): + def __init__(self, **kwargs): + self.__dict__.update(kwargs) + +class VkXML(ET.ElementTree): + def blacklist_block(self, req): + for t in req.iterfind('type'): + self.blacklist_types.add(t.attrib['name']) + for e in req.iterfind('enum'): + self.blacklist_enums.add(e.attrib['name']) + + def __init__(self, *args, **kwargs): + + super().__init__(*args, **kwargs) + self.blacklist_types = set() + self.blacklist_enums = set() + + for f in self.iterfind('feature'): + # Feature block for non-Vulkan API + if not 'vulkan' in f.attrib['api'].split(','): + for r in f.iterfind('require'): + self.blacklist_block(r) + + for e in self.iterfind('extensions/extension'): + # Entire extension is unsupported on vulkan or platform-specifid + if not 'vulkan' in e.attrib['supported'].split(',') or 'platform' in e.attrib: + for r in e.iterfind('require'): + self.blacklist_block(r) + continue + + # Only individual <require> blocks are API-specific + for r in e.iterfind('require[@api]'): + if not 'vulkan' in r.attrib['api'].split(','): + self.blacklist_block(r) + + def findall_enum(self, name): + for e in self.iterfind('enums[@name="{0}"]/enum'.format(name)): + if not 'alias' in e.attrib: + if not e.attrib['name'] in self.blacklist_enums: + yield e + for e in self.iterfind('.//enum[@extends="{0}"]'.format(name)): + if not 'alias' in e.attrib: + if not e.attrib['name'] in self.blacklist_enums: + yield e + + def findall_type(self, category): + for t in self.iterfind('types/type[@category="{0}"]'.format(category)): + name = t.attrib.get('name') or t.find('name').text + if name in self.blacklist_types: + continue + yield t + + +def get_vkenum(registry, enum): + for e in registry.findall_enum(enum): + yield e.attrib['name'] + +def get_vkobjects(registry): + for t in registry.findall_type('handle'): + if 'objtypeenum' in t.attrib: + yield Obj(enum = t.attrib['objtypeenum'], + name = t.find('name').text) + +def get_vkstructs(registry): + for t in registry.findall_type('struct'): + stype = None + for m in t.iterfind('member'): + if m.find('name').text == 'sType': + stype = m + break + + if stype is not None and 'values' in stype.attrib: + yield Obj(stype = stype.attrib['values'], + name = t.attrib['name']) + +def get_vkaccess(registry): + access = Obj(read = 0, write = 0) + for e in registry.findall_enum('VkAccessFlagBits2'): + if '_READ_' in e.attrib['name']: + access.read |= 1 << int(e.attrib['bitpos']) + if '_WRITE_' in e.attrib['name']: + access.write |= 1 << int(e.attrib['bitpos']) + return access + +def get_vkexts(registry): + for e in registry.iterfind('extensions/extension'): + promoted_ver = None + if res := re.match(r'VK_VERSION_(\d)_(\d)', e.attrib.get('promotedto', '')): + promoted_ver = 'VK_API_VERSION_{0}_{1}'.format(res[1], res[2]) + yield Obj(name = e.attrib['name'], + promoted_ver = promoted_ver) + +def get_vkfeatures(registry): + structs = []; + featuremap = {}; # features -> [struct] + for t in registry.findall_type('struct'): + sname = t.attrib['name'] + is_base = sname == 'VkPhysicalDeviceFeatures' + extends = t.attrib.get('structextends', []) + if is_base: + sname = 'VkPhysicalDeviceFeatures2' + stype = 'VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2' + elif not 'VkPhysicalDeviceFeatures2' in extends: + 
continue + + features = [] + for f in t.iterfind('member'): + if f.find('type').text == 'VkStructureType': + stype = f.attrib['values'] + elif f.find('type').text == 'VkBool32': + fname = f.find('name').text + if is_base: + fname = 'features.' + fname + features.append(Obj(name = fname)) + + core_ver = None + if res := re.match(r'VkPhysicalDeviceVulkan(\d)(\d)Features', sname): + core_ver = 'VK_API_VERSION_{0}_{1}'.format(res[1], res[2]) + + struct = Obj(name = sname, + stype = stype, + core_ver = core_ver, + is_base = is_base, + features = features) + + structs.append(struct) + for f in features: + featuremap.setdefault(f.name, []).append(struct) + + for s in structs: + for f in s.features: + f.replacements = featuremap[f.name] + core_ver = next(( r.core_ver for r in f.replacements if r.core_ver ), None) + for r in f.replacements: + if not r.core_ver: + r.max_ver = core_ver + + yield from structs + +def find_registry_xml(datadir): + registry_paths = [ + '{0}/vulkan/registry/vk.xml'.format(datadir), + '$MINGW_PREFIX/share/vulkan/registry/vk.xml', + '%VULKAN_SDK%/share/vulkan/registry/vk.xml', + '$VULKAN_SDK/share/vulkan/registry/vk.xml', + '/usr/share/vulkan/registry/vk.xml', + ] + + for p in registry_paths: + path = os.path.expandvars(p) + if os.path.isfile(path): + print('Found vk.xml: {0}'.format(path)) + return path + + print('Could not find the vulkan registry (vk.xml), please specify its ' + 'location manually using the -Dvulkan-registry=/path/to/vk.xml ' + 'option!', file=sys.stderr) + sys.exit(1) + +if __name__ == '__main__': + assert len(sys.argv) == 4 + datadir = sys.argv[1] + xmlfile = sys.argv[2] + outfile = sys.argv[3] + + if not xmlfile or xmlfile == '': + xmlfile = find_registry_xml(datadir) + + registry = VkXML(ET.parse(xmlfile)) + with open(outfile, 'w') as f: + f.write(TEMPLATE.render( + vkresults = get_vkenum(registry, 'VkResult'), + vkformats = get_vkenum(registry, 'VkFormat'), + vkspaces = get_vkenum(registry, 'VkColorSpaceKHR'), + vkhandles = get_vkenum(registry, 'VkExternalMemoryHandleTypeFlagBits'), + vkalphas = get_vkenum(registry, 'VkCompositeAlphaFlagBitsKHR'), + vktransforms = get_vkenum(registry, 'VkSurfaceTransformFlagBitsKHR'), + vkobjects = get_vkobjects(registry), + vkstructs = get_vkstructs(registry), + vkaccess = get_vkaccess(registry), + vkexts = get_vkexts(registry), + vkfeatures = get_vkfeatures(registry), + )) |
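
Editorial aside on the slab allocator in src/vulkan/malloc.c above: page occupancy within a slab is tracked in a single 64-bit `spacemap`, where a set bit marks a free page. Below is a minimal standalone sketch of that bookkeeping under the same convention; every name here is illustrative and not part of libplacebo, and `__builtin_ctzll` assumes a GCC/Clang-style compiler.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

// Build the initial free-page mask for a slab with `pages` pages (1..64),
// mirroring: (pages == 64) ? ~0LLU : ~(~0LLU << pages)
static uint64_t spacemap_init(int pages)
{
    assert(pages >= 1 && pages <= 64);
    return (pages == 64) ? ~0LLU : ~(~0LLU << pages);
}

// Claim the lowest free page, returning its index, or -1 if the slab is full.
static int spacemap_claim(uint64_t *map)
{
    if (!*map)
        return -1;
    int idx = __builtin_ctzll(*map); // lowest set bit = first free page
    *map &= ~(1LLU << idx);          // mark it as used
    return idx;
}

// Release a previously claimed page.
static void spacemap_release(uint64_t *map, int idx)
{
    *map |= 1LLU << idx;
}

int main(void)
{
    uint64_t map = spacemap_init(16);   // 16-page slab: bits 0..15 set
    int a = spacemap_claim(&map);       // -> 0
    int b = spacemap_claim(&map);       // -> 1
    spacemap_release(&map, a);
    int c = spacemap_claim(&map);       // -> 0 again (lowest free page wins)
    printf("%d %d %d, map=0x%llx\n", a, b, c, (unsigned long long) map);
    return 0;
}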
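
The surface-format selection in src/vulkan/swapchain.c (`pick_surf_format`) ranks candidates by summed component depth plus large fixed bonuses for matching the caller's colorimetry hint. A condensed sketch of that weighting follows, using hypothetical stand-in structs rather than the real `pl_color_space` / `pl_fmt` types:

// Hypothetical, simplified stand-ins for the real hint/candidate data:
struct fmt_hint { int primaries, transfer, wide_gamut, is_hdr; };
struct fmt_cand { int primaries, transfer, wide_gamut, is_hdr; int depth[3]; };

// Mirrors the bonus weights used in pick_surf_format(): an exact transfer
// match dominates, then exact primaries, then HDR-ness and gamut class;
// raw bit depth only breaks ties between otherwise equivalent formats.
int score_candidate(const struct fmt_cand *c, const struct fmt_hint *h)
{
    int score = c->depth[0] + c->depth[1] + c->depth[2];
    if (c->wide_gamut == h->wide_gamut) score += 1000;
    if (c->primaries  == h->primaries)  score += 2000;
    if (c->is_hdr     == h->is_hdr)     score += 10000;
    if (c->transfer   == h->transfer)   score += 20000;
    return score;
}

In other words, with these weights a 10-bit candidate with the wrong transfer can never outrank an 8-bit candidate with the right one, which is the behaviour the scoring loop above relies on.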
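
The pNext-chain helpers in src/vulkan/utils.c (`vk_link_struct`, `vk_find_struct`, `vk_chain_alloc`) all exploit the fact that every chainable Vulkan struct begins with an sType tag and a pNext pointer. A minimal sketch of the same traversal over a stand-in header struct (illustrative only, not the real VkBaseOutStructure):

#include <stddef.h>

// Minimal stand-in for VkBaseOutStructure: an integer type tag followed by
// a pointer to the next struct in the chain.
struct chain_hdr { int stype; struct chain_hdr *pnext; };

// Append `in` at the end of the chain rooted at `chain` (cf. vk_link_struct).
void chain_link(struct chain_hdr *chain, struct chain_hdr *in)
{
    while (chain->pnext)
        chain = chain->pnext;
    chain->pnext = in;
}

// Return the first struct with a matching type tag, or NULL (cf. vk_find_struct).
struct chain_hdr *chain_find(struct chain_hdr *chain, int stype)
{
    for (; chain; chain = chain->pnext) {
        if (chain->stype == stype)
            return chain;
    }
    return NULL;
}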
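
Frame pacing in the swapchain code (`vk_sw_submit_frame` / `vk_sw_swap_buffers`) boils down to an atomic in-flight counter: incremented when a present is submitted, decremented from a command-completion callback, and waited on by polling until it drops below the swapchain depth. A rough sketch of that pattern, with hypothetical names standing in for the real vk_cmd machinery:

#include <stdatomic.h>

// Hypothetical throttle state; the real code keeps this inside `struct priv`.
struct throttle { atomic_int in_flight; int depth; };

// Called on the submit path, before queueing the present.
void throttle_submit(struct throttle *t) { atomic_fetch_add(&t->in_flight, 1); }

// Called from the command-completion callback once the GPU has finished.
void throttle_complete(struct throttle *t) { atomic_fetch_sub(&t->in_flight, 1); }

// Called from swap_buffers: block (by polling completed commands) until the
// number of queued frames drops below the configured swapchain depth.
void throttle_wait(struct throttle *t, void (*poll_one)(void))
{
    while (atomic_load(&t->in_flight) >= t->depth)
        poll_one(); // stands in for vk_poll_commands(vk, UINT64_MAX)
}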