summaryrefslogtreecommitdiffstats
path: root/src/shaders
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/shaders.c992
-rw-r--r--src/shaders.h387
-rw-r--r--src/shaders/colorspace.c2120
-rw-r--r--src/shaders/custom.c89
-rw-r--r--src/shaders/custom_mpv.c1768
-rw-r--r--src/shaders/deinterlacing.c260
-rw-r--r--src/shaders/dithering.c527
-rw-r--r--src/shaders/film_grain.c65
-rw-r--r--src/shaders/film_grain.h75
-rw-r--r--src/shaders/film_grain_av1.c1001
-rw-r--r--src/shaders/film_grain_h274.c815
-rw-r--r--src/shaders/icc.c781
-rw-r--r--src/shaders/lut.c820
-rw-r--r--src/shaders/meson.build23
-rw-r--r--src/shaders/sampling.c1198
15 files changed, 10921 insertions, 0 deletions
diff --git a/src/shaders.c b/src/shaders.c
new file mode 100644
index 0000000..503ea78
--- /dev/null
+++ b/src/shaders.c
@@ -0,0 +1,992 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdio.h>
+#include <math.h>
+
+#include "common.h"
+#include "log.h"
+#include "shaders.h"
+
+/*
+ * Take an additional reference on a shader info object. `pinfo` may be NULL,
+ * in which case NULL is returned; otherwise the reference count is bumped
+ * and the public info pointer is handed back.
+ */
+pl_shader_info pl_shader_info_ref(pl_shader_info pinfo)
+{
+    struct sh_info *priv = (struct sh_info *) pinfo;
+    if (priv) {
+        pl_rc_ref(&priv->rc);
+        return &priv->info;
+    }
+
+    return NULL;
+}
+
+/*
+ * Drop one reference on `*pinfo` and reset the caller's pointer to NULL.
+ * The info object is freed once the reference count reports that this was
+ * the last reference. A NULL `*pinfo` is a no-op.
+ */
+void pl_shader_info_deref(pl_shader_info *pinfo)
+{
+    struct sh_info *priv = (struct sh_info *) *pinfo;
+    if (priv) {
+        if (pl_rc_deref(&priv->rc))
+            pl_free(priv);
+        *pinfo = NULL;
+    }
+}
+
+/*
+ * Allocate a zero-initialized `sh_info` under the parent `alloc`, with its
+ * reference count initialized and its own `tmp` allocation context.
+ */
+static struct sh_info *sh_info_alloc(void *alloc)
+{
+    struct sh_info *fresh = pl_zalloc_ptr(alloc, fresh);
+    pl_rc_init(&fresh->rc);
+    fresh->tmp = pl_tmp(fresh);
+    return fresh;
+}
+
+// Give up our reference on `info`. If that was the last reference, the
+// allocation is wiped and handed back for re-use (with a fresh reference);
+// otherwise somebody else still owns it and a brand new one is allocated.
+static struct sh_info *sh_info_recycle(struct sh_info *info)
+{
+    if (pl_rc_deref(&info->rc)) {
+        // We were the sole owner: clear all state and take it back
+        pl_free_children(info->tmp);
+        memset(&info->info, 0, sizeof(info->info)); // reset public fields
+        info->steps.num = 0;
+        info->desc.len = 0;
+        pl_rc_ref(&info->rc);
+        return info;
+    }
+
+    return sh_info_alloc(NULL);
+}
+
+/*
+ * Reverse the bit order of a byte (bit 0 <-> bit 7, bit 1 <-> bit 6, ...),
+ * using a 16-entry lookup table of reversed nibbles.
+ */
+static uint8_t reverse_bits(uint8_t x)
+{
+    static const uint8_t flipped[16] = {
+        0x0, 0x8, 0x4, 0xc, 0x2, 0xa, 0x6, 0xe,
+        0x1, 0x9, 0x5, 0xd, 0x3, 0xb, 0x7, 0xf,
+    };
+
+    const uint8_t lo = x & 0xF;
+    const uint8_t hi = x >> 4;
+
+    // The reversed low nibble becomes the high nibble and vice versa
+    return flipped[lo] << 4 | flipped[hi];
+}
+/*
+ * Apply the optional creation parameters to `sh` and assign its entry point
+ * name. When `params` is non-NULL it is copied into the shader's info struct
+ * and the bit-reversed `params->id` is stored in the upper bits of
+ * `sh->prefix`. The entry point name is a fresh identifier.
+ */
+static void init_shader(pl_shader sh, const struct pl_shader_params *params)
+{
+ if (params) {
+ sh->info->info.params = *params;
+
+ // To avoid collisions for shaders with very high number of
+ // identifiers, pack the shader ID into the highest bits (MSB -> LSB)
+ pl_static_assert(sizeof(sh->prefix) > sizeof(params->id));
+ const int shift = 8 * (sizeof(sh->prefix) - sizeof(params->id));
+ sh->prefix = reverse_bits(params->id) << shift;
+ }
+
+ sh->name = sh_fresh(sh, "main");
+}
+
+/*
+ * Allocate and initialize a new, mutable shader.
+ *
+ * `log` is stored for later diagnostics. `params` is optional; if it
+ * requests a non-zero GLSL version lower than the minimum required one, an
+ * error is logged and NULL is returned. Otherwise the shader gets its own
+ * `tmp` context, a fresh info object and one string builder per buffer slot.
+ */
+pl_shader pl_shader_alloc(pl_log log, const struct pl_shader_params *params)
+{
+    // Single source of truth for both the check and the error message
+    static const int glsl_ver_req = 130;
+    if (params && params->glsl.version && params->glsl.version < glsl_ver_req) {
+        pl_err(log, "Requested GLSL version %d too low (required: %d)",
+               params->glsl.version, glsl_ver_req);
+        return NULL;
+    }
+
+    pl_shader sh = pl_alloc_ptr(NULL, sh);
+    *sh = (struct pl_shader_t) {
+        .log     = log,
+        .tmp     = pl_tmp(sh),
+        .info    = sh_info_alloc(NULL),
+        .mutable = true,
+    };
+
+    for (int i = 0; i < PL_ARRAY_SIZE(sh->buffers); i++)
+        sh->buffers[i] = pl_str_builder_alloc(sh);
+
+    init_shader(sh, params);
+    return sh;
+}
+
+static void sh_obj_deref(pl_shader_obj obj);
+/*
+ * Release what the shader accumulated while being built: frees every
+ * allocation parented to `sh->tmp` and drops the reference held on each
+ * attached shader object, leaving the object list empty.
+ */
+void sh_deref(pl_shader sh)
+{
+ pl_free_children(sh->tmp);
+
+ for (int i = 0; i < sh->obj.num; i++)
+ sh_obj_deref(sh->obj.elem[i]);
+ sh->obj.num = 0;
+}
+
+/*
+ * Destroy the shader pointed to by `psh`: releases its per-build state,
+ * drops its reference on the info object and frees the shader itself via
+ * `pl_free_ptr`. Does nothing if `*psh` is NULL.
+ */
+void pl_shader_free(pl_shader *psh)
+{
+    pl_shader sh = *psh;
+    if (sh) {
+        sh_deref(sh);
+        pl_shader_info_deref((pl_shader_info *) &sh->info);
+        pl_free_ptr(psh);
+    }
+}
+/*
+ * Return `sh` to a freshly-initialized state so it can be reused to build a
+ * new shader. Per-build state is released via `sh_deref`; the log, the `tmp`
+ * context, the scratch buffer, the string builders and the backing storage
+ * of the internal arrays are carried over rather than freed. All remaining
+ * fields are zeroed by the designated initializer. `params` is forwarded to
+ * `init_shader` and may be NULL.
+ */
+void pl_shader_reset(pl_shader sh, const struct pl_shader_params *params)
+{
+ sh_deref(sh);
+
+ struct pl_shader_t new = {
+ .log = sh->log,
+ .tmp = sh->tmp,
+ .info = sh_info_recycle(sh->info), // reused unless still referenced elsewhere
+ .data.buf = sh->data.buf, // `data.len` stays 0, so the buffer is refilled from the start
+ .mutable = true,
+
+ // Preserve array allocations
+ .obj.elem = sh->obj.elem,
+ .vas.elem = sh->vas.elem,
+ .vars.elem = sh->vars.elem,
+ .descs.elem = sh->descs.elem,
+ .consts.elem = sh->consts.elem,
+ };
+
+ // Preserve buffer allocations
+ memcpy(new.buffers, sh->buffers, sizeof(new.buffers));
+ for (int i = 0; i < PL_ARRAY_SIZE(new.buffers); i++)
+ pl_str_builder_reset(new.buffers[i]);
+
+ *sh = new;
+ init_shader(sh, params);
+}
+/*
+ * Reserve `size` bytes, aligned to `align`, from the shader's scratch buffer
+ * (`sh->data`) and return a pointer to them. If the current buffer is too
+ * small, a larger one is allocated and the reservation is placed at its
+ * start; the old buffer is kept alive under `sh->tmp` so that pointers
+ * handed out earlier remain valid.
+ */
+static void *sh_alloc(pl_shader sh, size_t size, size_t align)
+{
+ const size_t offset = PL_ALIGN2(sh->data.len, align);
+ const size_t req_size = offset + size;
+ if (req_size <= pl_get_size(sh->data.buf)) {
+ sh->data.len = offset + size;
+ return sh->data.buf + offset;
+ }
+
+ // We can't realloc this buffer because various pointers will be left
+ // dangling, so just reparent it onto `sh->tmp` (so it will be cleaned
+ // up when the shader is next reset) and allocate a new, larger buffer
+ // in its place
+ const size_t new_size = PL_MAX(req_size << 1, 256); // at least 256 bytes, else twice the request
+ pl_steal(sh->tmp, sh->data.buf);
+ sh->data.buf = pl_alloc(sh, new_size);
+ sh->data.len = size; // the new buffer holds only this reservation so far
+ return sh->data.buf;
+}
+
+/*
+ * Copy `size` bytes from `data` into shader-owned scratch memory aligned to
+ * `align`, and return the copy. A zero `size` yields NULL without touching
+ * `data`; otherwise `data` must be non-NULL.
+ */
+static void *sh_memdup(pl_shader sh, const void *data, size_t size, size_t align)
+{
+    if (size == 0)
+        return NULL;
+
+    void *copy = sh_alloc(sh, size, align);
+    assert(data);
+    return memcpy(copy, data, size);
+}
+
+// Report whether the shader has been flagged as failed.
+bool pl_shader_is_failed(const pl_shader sh)
+{
+    const bool failed = sh->failed;
+    return failed;
+}
+
+/*
+ * Determine the GLSL version description that applies to `sh`. Preference
+ * order: the version requested in the shader params (if non-zero), then the
+ * one reported by the shader's GPU (if any), then a bare GLSL 130 default.
+ */
+struct pl_glsl_version sh_glsl(const pl_shader sh)
+{
+    const struct pl_glsl_version requested = SH_PARAMS(sh).glsl;
+    if (requested.version)
+        return requested;
+
+    pl_gpu gpu = SH_GPU(sh);
+    if (gpu)
+        return gpu->glsl;
+
+    return (struct pl_glsl_version) { .version = 130 };
+}
+/*
+ * Try to make `sh` a compute shader using a work group of `bw` x `bh`
+ * threads and `mem` additional bytes of shared memory. `flex` states that
+ * the caller can tolerate a different group size than the one requested.
+ *
+ * Returns true if the shader is (now) a compute shader compatible with the
+ * request. Returns false if compute is unavailable, shared memory would be
+ * exceeded, the shader is already a fragment shader, or the group size
+ * conflicts with limits or with a previously fixed group size. On every
+ * false return the shader's type, group size and shared memory total are
+ * left as they were.
+ */
+bool sh_try_compute(pl_shader sh, int bw, int bh, bool flex, size_t mem)
+{
+ pl_assert(bw && bh);
+ int *sh_bw = &sh->group_size[0];
+ int *sh_bh = &sh->group_size[1];
+
+ struct pl_glsl_version glsl = sh_glsl(sh);
+ if (!glsl.compute) {
+ PL_TRACE(sh, "Disabling compute shader due to missing `compute` support");
+ return false;
+ }
+
+ if (sh->shmem + mem > glsl.max_shmem_size) {
+ PL_TRACE(sh, "Disabling compute shader due to insufficient shmem");
+ return false;
+ }
+
+ if (sh->type == SH_FRAGMENT) {
+ PL_TRACE(sh, "Disabling compute shader because shader is already marked "
+ "as fragment shader");
+ return false;
+ }
+
+ if (bw > glsl.max_group_size[0] ||
+ bh > glsl.max_group_size[1] ||
+ (bw * bh) > glsl.max_group_threads)
+ {
+ if (!flex) {
+ PL_TRACE(sh, "Disabling compute shader due to exceeded group "
+ "thread count.");
+ return false;
+ } else {
+ // Pick better group sizes
+ bw = PL_MIN(bw, glsl.max_group_size[0]);
+ bh = glsl.max_group_threads / bw; // NOTE(review): not clamped to max_group_size[1] - confirm intended
+ }
+ }
+
+ sh->shmem += mem; // undone below if the request is rejected after all
+
+ // If the current shader is either not a compute shader, or we have no
+ // choice but to override the metadata, always do so
+ if (sh->type != SH_COMPUTE || (sh->flexible_work_groups && !flex)) {
+ *sh_bw = bw;
+ *sh_bh = bh;
+ sh->type = SH_COMPUTE;
+ sh->flexible_work_groups = flex;
+ return true;
+ }
+
+ // If both shaders are flexible, pick the larger of the two
+ if (sh->flexible_work_groups && flex) {
+ *sh_bw = PL_MAX(*sh_bw, bw);
+ *sh_bh = PL_MAX(*sh_bh, bh);
+ pl_assert(*sh_bw * *sh_bh <= glsl.max_group_threads); // NOTE(review): per-axis max can exceed the thread limit - confirm callers
+ return true;
+ }
+
+ // At this point we're looking only at a non-flexible compute shader
+ pl_assert(sh->type == SH_COMPUTE && !sh->flexible_work_groups);
+ if (!flex) {
+ // Ensure parameters match
+ if (bw != *sh_bw || bh != *sh_bh) {
+ PL_TRACE(sh, "Disabling compute shader due to incompatible group "
+ "sizes %dx%d and %dx%d", *sh_bw, *sh_bh, bw, bh);
+ sh->shmem -= mem;
+ return false;
+ }
+ }
+ // A flexible request simply adopts the existing fixed group size
+ return true;
+}
+
+// Report whether the shader's type is currently `SH_COMPUTE`.
+bool pl_shader_is_compute(const pl_shader sh)
+{
+    const bool is_compute = (sh->type == SH_COMPUTE);
+    return is_compute;
+}
+
+/*
+ * Retrieve the shader's output size. Returns false (leaving `*w` and `*h`
+ * untouched) unless both output dimensions are known. When the shader is
+ * flagged as transposed, width and height are swapped in the result.
+ */
+bool pl_shader_output_size(const pl_shader sh, int *w, int *h)
+{
+    if (!(sh->output_w && sh->output_h))
+        return false;
+
+    if (sh->transpose) {
+        *w = sh->output_h;
+        *h = sh->output_w;
+    } else {
+        *w = sh->output_w;
+        *h = sh->output_h;
+    }
+
+    return true;
+}
+/*
+ * Produce a new identifier for `sh`: increments the shader's counter and
+ * combines it with `sh->prefix`. `name` must be non-NULL; it is passed on to
+ * `sh_mkident`.
+ */
+ident_t sh_fresh(pl_shader sh, const char *name)
+{
+ unsigned short id = ++sh->fresh;
+ assert(!(sh->prefix & id)); // the counter must not have grown into the prefix bits
+ id |= sh->prefix;
+
+ assert(name);
+ return sh_mkident(id, name);
+}
+
+/*
+ * Generate a fresh identifier based on `*pname` and overwrite `*pname` with
+ * the packed form of that identifier. Returns the identifier itself.
+ */
+static inline ident_t sh_fresh_name(pl_shader sh, const char **pname)
+{
+    const ident_t fresh = sh_fresh(sh, *pname);
+    *pname = sh_ident_pack(fresh);
+    return fresh;
+}
+/*
+ * Register the variable `sv` on `sh` and return its identifier. The
+ * variable's name is replaced by a packed fresh identifier, and its data is
+ * copied into shader-owned scratch memory (sized and aligned according to
+ * the variable's host layout) before being appended to `sh->vars`.
+ */
+ident_t sh_var(pl_shader sh, struct pl_shader_var sv)
+{
+ ident_t id = sh_fresh_name(sh, &sv.var.name);
+ struct pl_var_layout layout = pl_var_host_layout(0, &sv.var);
+ sv.data = sh_memdup(sh, sv.data, layout.size, layout.stride);
+ PL_ARRAY_APPEND(sh, sh->vars, sv);
+ return id;
+}
+
+// Register a signed integer variable called `name` holding `val`.
+// `dynamic` is stored on the variable unchanged.
+ident_t sh_var_int(pl_shader sh, const char *name, int val, bool dynamic)
+{
+    struct pl_shader_var sv = {
+        .var     = pl_var_int(name),
+        .data    = &val,
+        .dynamic = dynamic,
+    };
+
+    return sh_var(sh, sv);
+}
+
+// Register an unsigned integer variable called `name` holding `val`.
+// `dynamic` is stored on the variable unchanged.
+ident_t sh_var_uint(pl_shader sh, const char *name, unsigned int val, bool dynamic)
+{
+    struct pl_shader_var sv = {
+        .var     = pl_var_uint(name),
+        .data    = &val,
+        .dynamic = dynamic,
+    };
+
+    return sh_var(sh, sv);
+}
+
+// Register a float variable called `name` holding `val`.
+// `dynamic` is stored on the variable unchanged.
+ident_t sh_var_float(pl_shader sh, const char *name, float val, bool dynamic)
+{
+    struct pl_shader_var sv = {
+        .var     = pl_var_float(name),
+        .data    = &val,
+        .dynamic = dynamic,
+    };
+
+    return sh_var(sh, sv);
+}
+/*
+ * Register a 3x3 matrix variable called `name`. The data handed to `sh_var`
+ * is `val.m` passed through `PL_TRANSPOSE_3X3`; the `dynamic` flag is left
+ * at its zero default.
+ */
+ident_t sh_var_mat3(pl_shader sh, const char *name, pl_matrix3x3 val)
+{
+ return sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_mat3(name),
+ .data = PL_TRANSPOSE_3X3(val.m),
+ });
+}
+/*
+ * Attach the descriptor `sd` to `sh` and return its identifier.
+ *
+ * For uniform/storage buffer descriptors this asserts that the bound object
+ * is not attached already, copies the buffer variable array into
+ * shader-owned memory, and emits one `#define` per buffer variable mapping
+ * its original name to a fresh identifier. All other valid descriptor types
+ * must not carry buffer variables.
+ */
+ident_t sh_desc(pl_shader sh, struct pl_shader_desc sd)
+{
+ switch (sd.desc.type) {
+ case PL_DESC_BUF_UNIFORM:
+ case PL_DESC_BUF_STORAGE:
+ for (int i = 0; i < sh->descs.num; i++) // ensure uniqueness
+ pl_assert(sh->descs.elem[i].binding.object != sd.binding.object);
+ size_t bsize = sizeof(sd.buffer_vars[0]) * sd.num_buffer_vars;
+ sd.buffer_vars = sh_memdup(sh, sd.buffer_vars, bsize,
+ alignof(struct pl_buffer_var));
+ for (int i = 0; i < sd.num_buffer_vars; i++) {
+ struct pl_var *bv = &sd.buffer_vars[i].var;
+ const char *name = bv->name; // keep the original name; `bv->name` is overwritten below
+ GLSLP("#define %s "$"\n", name, sh_fresh_name(sh, &bv->name));
+ }
+ break;
+
+ case PL_DESC_BUF_TEXEL_UNIFORM:
+ case PL_DESC_BUF_TEXEL_STORAGE:
+ case PL_DESC_SAMPLED_TEX:
+ case PL_DESC_STORAGE_IMG:
+ pl_assert(!sd.num_buffer_vars);
+ break;
+
+ case PL_DESC_INVALID:
+ case PL_DESC_TYPE_COUNT:
+ pl_unreachable();
+ }
+
+ ident_t id = sh_fresh_name(sh, &sd.desc.name);
+ PL_ARRAY_APPEND(sh, sh->descs, sd);
+ return id;
+}
+/*
+ * Add the constant `sc` to `sh` and return its identifier. One of three
+ * strategies is used, tried in this order:
+ *  1. If the shader params request dynamic constants and `sc` is not needed
+ *     at compile time, it is registered as an ordinary scalar variable.
+ *  2. If the GPU reports a non-zero `limits.max_constants`, the value is
+ *     copied and recorded in `sh->consts`; compile-time constants
+ *     additionally require `limits.array_size_constants`.
+ *  3. Otherwise a literal `const` declaration is emitted into the header.
+ */
+ident_t sh_const(pl_shader sh, struct pl_shader_const sc)
+{
+ if (SH_PARAMS(sh).dynamic_constants && !sc.compile_time) {
+ return sh_var(sh, (struct pl_shader_var) {
+ .var = {
+ .name = sc.name,
+ .type = sc.type,
+ .dim_v = 1,
+ .dim_m = 1,
+ .dim_a = 1,
+ },
+ .data = sc.data,
+ });
+ }
+
+ ident_t id = sh_fresh_name(sh, &sc.name);
+
+ pl_gpu gpu = SH_GPU(sh);
+ if (gpu && gpu->limits.max_constants) {
+ if (!sc.compile_time || gpu->limits.array_size_constants) {
+ size_t size = pl_var_type_size(sc.type);
+ sc.data = sh_memdup(sh, sc.data, size, size); // the value's size doubles as its alignment
+ PL_ARRAY_APPEND(sh, sh->consts, sc);
+ return id;
+ }
+ }
+
+ // Fallback for GPUs without specialization constants
+ switch (sc.type) {
+ case PL_VAR_SINT:
+ GLSLH("const int "$" = %d; \n", id, *(int *) sc.data);
+ return id;
+ case PL_VAR_UINT:
+ GLSLH("const uint "$" = uint(%u); \n", id, *(unsigned int *) sc.data);
+ return id;
+ case PL_VAR_FLOAT:
+ GLSLH("const float "$" = float(%f); \n", id, *(float *) sc.data);
+ return id;
+ case PL_VAR_INVALID:
+ case PL_VAR_TYPE_COUNT:
+ break;
+ }
+
+ pl_unreachable();
+}
+
+// Add a signed integer constant called `name` with value `val`.
+ident_t sh_const_int(pl_shader sh, const char *name, int val)
+{
+    struct pl_shader_const sc = {
+        .type = PL_VAR_SINT,
+        .name = name,
+        .data = &val,
+    };
+
+    return sh_const(sh, sc);
+}
+
+// Add an unsigned integer constant called `name` with value `val`.
+ident_t sh_const_uint(pl_shader sh, const char *name, unsigned int val)
+{
+    struct pl_shader_const sc = {
+        .type = PL_VAR_UINT,
+        .name = name,
+        .data = &val,
+    };
+
+    return sh_const(sh, sc);
+}
+
+// Add a float constant called `name` with value `val`.
+ident_t sh_const_float(pl_shader sh, const char *name, float val)
+{
+    struct pl_shader_const sc = {
+        .type = PL_VAR_FLOAT,
+        .name = name,
+        .data = &val,
+    };
+
+    return sh_const(sh, sc);
+}
+/*
+ * Register the vertex attribute `sva` on `sh` and return its identifier.
+ * The four per-vertex values referenced by `sva.data` are copied
+ * back-to-back into shader-owned scratch memory (`fmt->texel_size` bytes
+ * each) and the pointers are redirected to those copies.
+ */
+ident_t sh_attr(pl_shader sh, struct pl_shader_va sva)
+{
+ const size_t vsize = sva.attr.fmt->texel_size;
+ uint8_t *data = sh_alloc(sh, vsize * 4, vsize);
+ for (int i = 0; i < 4; i++) {
+ memcpy(data, sva.data[i], vsize);
+ sva.data[i] = data;
+ data += vsize;
+ }
+
+ ident_t id = sh_fresh_name(sh, &sva.attr.name);
+ PL_ARRAY_APPEND(sh, sh->vas, sva);
+ return id;
+}
+
+/*
+ * Register a two-component float vertex attribute called `name` whose four
+ * per-vertex values are the corners of `rc`, in the order (x0,y0), (x1,y0),
+ * (x0,y1), (x1,y1).
+ *
+ * Marks the shader as failed and returns NULL_IDENT if the shader has no GPU
+ * or the GPU offers no suitable vertex format; otherwise returns the new
+ * attribute's identifier.
+ */
+ident_t sh_attr_vec2(pl_shader sh, const char *name, const pl_rect2df *rc)
+{
+    pl_gpu gpu = SH_GPU(sh);
+    if (!gpu) {
+        SH_FAIL(sh, "Failed adding vertex attr '%s': No GPU available!", name);
+        return NULL_IDENT;
+    }
+
+    pl_fmt fmt = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 2);
+    if (!fmt) {
+        SH_FAIL(sh, "Failed adding vertex attr '%s': no vertex fmt!", name);
+        return NULL_IDENT;
+    }
+
+    float verts[4][2] = {
+        { rc->x0, rc->y0 },
+        { rc->x1, rc->y0 },
+        { rc->x0, rc->y1 },
+        { rc->x1, rc->y1 },
+    };
+
+    // Use the format validated above instead of looking it up a second time
+    return sh_attr(sh, (struct pl_shader_va) {
+        .attr = {
+            .name = name,
+            .fmt  = fmt,
+        },
+        .data = { verts[0], verts[1], verts[2], verts[3] },
+    });
+}
+/*
+ * Bind the texture `tex` to `sh` as a sampled texture and return the
+ * identifier of the resulting descriptor. Marks the shader as failed and
+ * returns NULL_IDENT if `tex` is not two-dimensional or not sampleable.
+ *
+ * `out_pos` (optional) receives a vertex attribute with the texture
+ * coordinates covering `rect`, or the whole texture when `rect` is NULL.
+ * `out_pt` (optional) receives a vec2 variable holding the coordinate
+ * scale: 1/width and 1/height, or 1.0 for `PL_SAMPLER_RECT` textures.
+ */
+ident_t sh_bind(pl_shader sh, pl_tex tex,
+ enum pl_tex_address_mode address_mode,
+ enum pl_tex_sample_mode sample_mode,
+ const char *name, const pl_rect2df *rect,
+ ident_t *out_pos, ident_t *out_pt)
+{
+ if (pl_tex_params_dimension(tex->params) != 2) {
+ SH_FAIL(sh, "Failed binding texture '%s': not a 2D texture!", name);
+ return NULL_IDENT;
+ }
+
+ if (!tex->params.sampleable) {
+ SH_FAIL(sh, "Failed binding texture '%s': texture not sampleable!", name);
+ return NULL_IDENT;
+ }
+
+ ident_t itex = sh_desc(sh, (struct pl_shader_desc) {
+ .desc = {
+ .name = name,
+ .type = PL_DESC_SAMPLED_TEX,
+ },
+ .binding = {
+ .object = tex,
+ .address_mode = address_mode,
+ .sample_mode = sample_mode,
+ },
+ });
+
+ float sx, sy;
+ if (tex->sampler_type == PL_SAMPLER_RECT) {
+ sx = 1.0;
+ sy = 1.0;
+ } else {
+ sx = 1.0 / tex->params.w;
+ sy = 1.0 / tex->params.h;
+ }
+
+ if (out_pos) {
+ pl_rect2df full = {
+ .x1 = tex->params.w,
+ .y1 = tex->params.h,
+ };
+
+ rect = PL_DEF(rect, &full);
+ *out_pos = sh_attr_vec2(sh, "tex_coord", &(pl_rect2df) { // NULL_IDENT if the attribute could not be added
+ .x0 = sx * rect->x0, .y0 = sy * rect->y0,
+ .x1 = sx * rect->x1, .y1 = sy * rect->y1,
+ });
+ }
+
+ if (out_pt) {
+ *out_pt = sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_vec2("tex_pt"),
+ .data = &(float[2]) {sx, sy},
+ });
+ }
+
+ return itex;
+}
+/*
+ * Append `new_var` to the buffer descriptor `buf_desc`. The variable's
+ * layout is computed starting at the current end of the buffer, using
+ * std140 rules for uniform buffers and std430 rules for storage buffers.
+ *
+ * Returns false, leaving `buf_desc` unmodified, if the variable would end
+ * beyond the GPU's size limit for that buffer type. On success the layout is
+ * also written to `out_layout` (if non-NULL), the variable is appended using
+ * `alloc` as the allocation parent, and true is returned. Descriptor types
+ * other than uniform/storage buffers are not valid here.
+ */
+bool sh_buf_desc_append(void *alloc, pl_gpu gpu,
+ struct pl_shader_desc *buf_desc,
+ struct pl_var_layout *out_layout,
+ const struct pl_var new_var)
+{
+ struct pl_buffer_var bv = { .var = new_var };
+ size_t cur_size = sh_buf_desc_size(buf_desc);
+
+ switch (buf_desc->desc.type) {
+ case PL_DESC_BUF_UNIFORM:
+ bv.layout = pl_std140_layout(cur_size, &new_var);
+ if (bv.layout.offset + bv.layout.size > gpu->limits.max_ubo_size)
+ return false;
+ break;
+ case PL_DESC_BUF_STORAGE:
+ bv.layout = pl_std430_layout(cur_size, &new_var);
+ if (bv.layout.offset + bv.layout.size > gpu->limits.max_ssbo_size)
+ return false;
+ break;
+ case PL_DESC_INVALID:
+ case PL_DESC_SAMPLED_TEX:
+ case PL_DESC_STORAGE_IMG:
+ case PL_DESC_BUF_TEXEL_UNIFORM:
+ case PL_DESC_BUF_TEXEL_STORAGE:
+ case PL_DESC_TYPE_COUNT:
+ pl_unreachable();
+ }
+
+ if (out_layout)
+ *out_layout = bv.layout;
+ PL_ARRAY_APPEND_RAW(alloc, buf_desc->buffer_vars, buf_desc->num_buffer_vars, bv);
+ return true;
+}
+
+/*
+ * Return the number of bytes currently occupied by the buffer descriptor's
+ * variables, i.e. the end offset of its last variable, or 0 if it has none.
+ */
+size_t sh_buf_desc_size(const struct pl_shader_desc *buf_desc)
+{
+    if (buf_desc->num_buffer_vars == 0)
+        return 0;
+
+    const struct pl_buffer_var *tail =
+        &buf_desc->buffer_vars[buf_desc->num_buffer_vars - 1];
+
+    return tail->layout.offset + tail->layout.size;
+}
+/*
+ * printf-style variant of `sh_describe`: formats the message into memory
+ * allocated on the shader info's `tmp` context and passes the resulting
+ * string to `sh_describe`.
+ */
+void sh_describef(pl_shader sh, const char *fmt, ...)
+{
+ va_list ap;
+ va_start(ap, fmt);
+ sh_describe(sh, pl_vasprintf(sh->info->tmp, fmt, ap));
+ va_end(ap);
+}
+/*
+ * GLSL fragments used when emitting a shader's function definition, indexed
+ * by shader signature: the parameter list for an input signature (`insigs`),
+ * the return type for an output signature (`outsigs`), and the trailing
+ * return statement for an output signature (`retvals`).
+ */
+static const char *insigs[] = {
+ [PL_SHADER_SIG_NONE] = "",
+ [PL_SHADER_SIG_COLOR] = "vec4 color",
+};
+
+static const char *outsigs[] = {
+ [PL_SHADER_SIG_NONE] = "void",
+ [PL_SHADER_SIG_COLOR] = "vec4",
+};
+
+static const char *retvals[] = {
+ [PL_SHADER_SIG_NONE] = "",
+ [PL_SHADER_SIG_COLOR] = "return color;",
+};
+
+// libplacebo currently only allows 2D samplers for shader signatures
+static const char *samplers2D[] = {
+ [PL_SAMPLER_NORMAL] = "sampler2D",
+ [PL_SAMPLER_RECT] = "sampler2DRect",
+ [PL_SAMPLER_EXTERNAL] = "samplerExternalOES",
+};
+/*
+ * Merge the shader `sub` into `sh` as a callable helper function and return
+ * that function's identifier.
+ *
+ * Returns NULL_IDENT without merging anything if both shaders share the same
+ * identifier prefix, if their output sizes are incompatible, or if `sub` is
+ * a compute shader whose requirements `sh` cannot adopt.
+ *
+ * On success, `sub`'s prelude and header are appended to `sh`, its body is
+ * wrapped into a function emitted into `sh`'s header, and its objects,
+ * inputs, scratch data, temporary allocations and description steps are
+ * moved over to `sh`. `sub` is marked as failed afterwards.
+ */
+ident_t sh_subpass(pl_shader sh, pl_shader sub)
+{
+ pl_assert(sh->mutable);
+
+ if (sh->prefix == sub->prefix) {
+ PL_TRACE(sh, "Can't merge shaders: conflicting identifiers!");
+ return NULL_IDENT;
+ }
+
+ // Check for shader compatibility
+ int res_w = PL_DEF(sh->output_w, sub->output_w),
+ res_h = PL_DEF(sh->output_h, sub->output_h);
+
+ if ((sub->output_w && res_w != sub->output_w) ||
+ (sub->output_h && res_h != sub->output_h))
+ {
+ PL_TRACE(sh, "Can't merge shaders: incompatible sizes: %dx%d and %dx%d",
+ sh->output_w, sh->output_h, sub->output_w, sub->output_h);
+ return NULL_IDENT;
+ }
+
+ if (sub->type == SH_COMPUTE) {
+ int subw = sub->group_size[0],
+ subh = sub->group_size[1];
+ bool flex = sub->flexible_work_groups;
+
+ if (!sh_try_compute(sh, subw, subh, flex, sub->shmem)) {
+ PL_TRACE(sh, "Can't merge shaders: incompatible block sizes or "
+ "exceeded shared memory resource capabilities");
+ return NULL_IDENT;
+ }
+ }
+
+ sh->output_w = res_w;
+ sh->output_h = res_h;
+
+ // Append the prelude and header
+ pl_str_builder_concat(sh->buffers[SH_BUF_PRELUDE], sub->buffers[SH_BUF_PRELUDE]);
+ pl_str_builder_concat(sh->buffers[SH_BUF_HEADER], sub->buffers[SH_BUF_HEADER]);
+
+ // Append the body as a new header function
+ if (sub->input == PL_SHADER_SIG_SAMPLER) {
+ pl_assert(sub->sampler_prefix);
+ GLSLH("%s "$"(%c%s src_tex, vec2 tex_coord) {\n",
+ outsigs[sub->output], sub->name,
+ sub->sampler_prefix, samplers2D[sub->sampler_type]);
+ } else {
+ GLSLH("%s "$"(%s) {\n",
+ outsigs[sub->output], sub->name, insigs[sub->input]);
+ }
+ pl_str_builder_concat(sh->buffers[SH_BUF_HEADER], sub->buffers[SH_BUF_BODY]);
+ GLSLH("%s\n}\n\n", retvals[sub->output]);
+
+ // Steal all inputs and objects from the subpass
+#define ARRAY_STEAL(arr) do \
+{ \
+ PL_ARRAY_CONCAT(sh, sh->arr, sub->arr); \
+ sub->arr.num = 0; \
+} while (0)
+
+ ARRAY_STEAL(obj);
+ ARRAY_STEAL(vas);
+ ARRAY_STEAL(vars);
+ ARRAY_STEAL(descs);
+ ARRAY_STEAL(consts);
+#undef ARRAY_STEAL
+
+ // Steal the scratch buffer (if it holds data)
+ if (sub->data.len) {
+ pl_steal(sh->tmp, sub->data.buf);
+ sub->data = (pl_str) {0};
+ }
+
+ // Steal all temporary allocations and mark the child as unusable
+ pl_steal(sh->tmp, sub->tmp);
+ sub->tmp = pl_tmp(sub);
+ sub->failed = true;
+
+ // Steal the shader steps array (and allocations)
+ pl_assert(pl_rc_count(&sub->info->rc) == 1);
+ PL_ARRAY_CONCAT(sh->info, sh->info->steps, sub->info->steps);
+ pl_steal(sh->info->tmp, sub->info->tmp);
+ sub->info->tmp = pl_tmp(sub->info);
+ sub->info->steps.num = 0; // sanity
+
+ return sub->name;
+}
+/*
+ * Assemble the final GLSL of `sh` and mark the shader immutable. Returns
+ * NULL if the shader has failed.
+ *
+ * The header, the function signature, the body, the footer and the closing
+ * return statement are all concatenated onto the prelude builder, which is
+ * returned. The public shader info is filled in along the way: the step list
+ * and a comma-separated description in which repeated steps are collapsed
+ * into a single "<step> xN" entry.
+ */
+pl_str_builder sh_finalize_internal(pl_shader sh)
+{
+ pl_assert(sh->mutable); // this function should only ever be called once
+ if (sh->failed)
+ return NULL;
+
+ // Padding for readability
+ GLSLP("\n");
+
+ // Concatenate everything onto the prelude to form the final output
+ pl_str_builder_concat(sh->buffers[SH_BUF_PRELUDE], sh->buffers[SH_BUF_HEADER]);
+
+ if (sh->input == PL_SHADER_SIG_SAMPLER) {
+ pl_assert(sh->sampler_prefix);
+ GLSLP("%s "$"(%c%s src_tex, vec2 tex_coord) {\n",
+ outsigs[sh->output], sh->name,
+ sh->sampler_prefix,
+ samplers2D[sh->sampler_type]);
+ } else {
+ GLSLP("%s "$"(%s) {\n", outsigs[sh->output], sh->name, insigs[sh->input]);
+ }
+
+ pl_str_builder_concat(sh->buffers[SH_BUF_PRELUDE], sh->buffers[SH_BUF_BODY]);
+ pl_str_builder_concat(sh->buffers[SH_BUF_PRELUDE], sh->buffers[SH_BUF_FOOTER]);
+ GLSLP("%s\n}\n\n", retvals[sh->output]);
+
+ // Generate the shader info
+ struct sh_info *info = sh->info;
+ info->info.steps = info->steps.elem;
+ info->info.num_steps = info->steps.num;
+ info->info.description = "(unknown shader)"; // replaced below if any step was recorded
+
+ // Generate pretty description
+ for (int i = 0; i < info->steps.num; i++) {
+ const char *step = info->steps.elem[i];
+
+ // Prevent duplicates. We're okay using a weak equality check here
+ // because most pass descriptions are static strings.
+ for (int j = 0; j < i; j++) {
+ if (info->steps.elem[j] == step)
+ goto next_step;
+ }
+
+ int count = 1; // occurrences of this step, compared by pointer
+ for (int j = i+1; j < info->steps.num; j++) {
+ if (info->steps.elem[j] == step)
+ count++;
+ }
+
+ const char *prefix = i > 0 ? ", " : "";
+ if (count > 1) {
+ pl_str_append_asprintf(info, &info->desc, "%s%s x%d",
+ prefix, step, count);
+ } else {
+ pl_str_append_asprintf(info, &info->desc, "%s%s", prefix, step);
+ }
+
+next_step: ;
+ }
+
+ if (info->desc.len)
+ info->info.description = (char *) info->desc.buf;
+
+ sh->mutable = false;
+ return sh->buffers[SH_BUF_PRELUDE];
+}
+
+/*
+ * Finalize `sh` and return a description of the finished shader: its GLSL
+ * text, entry point name, signatures, compute metadata and all inputs.
+ *
+ * Returns NULL if the shader has failed. If the shader was already
+ * finalized, the previously generated result is returned as-is. The returned
+ * pointer refers to storage inside `sh`.
+ */
+const struct pl_shader_res *pl_shader_finalize(pl_shader sh)
+{
+    if (sh->failed) {
+        return NULL;
+    } else if (!sh->mutable) {
+        return &sh->result;
+    }
+
+    pl_shader_info info = &sh->info->info;
+    pl_str_builder glsl = sh_finalize_internal(sh);
+
+    // Turn ident_t into friendly strings before passing it to users
+#define FIX_IDENT(name) \
+    name = sh_ident_tostr(sh_ident_unpack(name))
+    for (int i = 0; i < sh->vas.num; i++)
+        FIX_IDENT(sh->vas.elem[i].attr.name);
+    for (int i = 0; i < sh->vars.num; i++)
+        FIX_IDENT(sh->vars.elem[i].var.name);
+    for (int i = 0; i < sh->consts.num; i++)
+        FIX_IDENT(sh->consts.elem[i].name);
+    for (int i = 0; i < sh->descs.num; i++) {
+        struct pl_shader_desc *sd = &sh->descs.elem[i];
+        FIX_IDENT(sd->desc.name);
+        // Walk this descriptor's buffer variables; the index `j` must
+        // advance here, not the descriptor pointer
+        for (int j = 0; j < sd->num_buffer_vars; j++)
+            FIX_IDENT(sd->buffer_vars[j].var.name);
+    }
+#undef FIX_IDENT
+
+    sh->result = (struct pl_shader_res) {
+        .info               = info,
+        .glsl               = (char *) pl_str_builder_exec(glsl).buf,
+        .name               = sh_ident_tostr(sh->name),
+        .input              = sh->input,
+        .output             = sh->output,
+        .compute_group_size = { sh->group_size[0], sh->group_size[1] },
+        .compute_shmem      = sh->shmem,
+        .vertex_attribs     = sh->vas.elem,
+        .num_vertex_attribs = sh->vas.num,
+        .variables          = sh->vars.elem,
+        .num_variables      = sh->vars.num,
+        .descriptors        = sh->descs.elem,
+        .num_descriptors    = sh->descs.num,
+        .constants          = sh->consts.elem,
+        .num_constants      = sh->consts.num,
+        // deprecated fields
+        .params             = info->params,
+        .steps              = info->steps,
+        .num_steps          = info->num_steps,
+        .description        = info->description,
+    };
+
+    return &sh->result;
+}
+/*
+ * Check that an operation expecting the input signature `insig` and the
+ * output size `w` x `h` (0 = no requirement) may be appended to `sh`.
+ *
+ * Marks the shader as failed and returns false if the shader has already
+ * failed, is immutable, has a conflicting output size, or currently outputs
+ * a different signature than `insig`. If the shader has no output yet and
+ * `insig` is non-zero, `insig` becomes the shader's own input signature.
+ * On success the output signature is set to `PL_SHADER_SIG_COLOR` and any
+ * still-unset output dimension is filled in from `w` / `h`.
+ */
+bool sh_require(pl_shader sh, enum pl_shader_sig insig, int w, int h)
+{
+ if (sh->failed) {
+ SH_FAIL(sh, "Attempting to modify a failed shader!");
+ return false;
+ }
+
+ if (!sh->mutable) {
+ SH_FAIL(sh, "Attempted to modify an immutable shader!");
+ return false;
+ }
+
+ if ((w && sh->output_w && sh->output_w != w) ||
+ (h && sh->output_h && sh->output_h != h))
+ {
+ SH_FAIL(sh, "Illegal sequence of shader operations: Incompatible "
+ "output size requirements %dx%d and %dx%d",
+ sh->output_w, sh->output_h, w, h);
+ return false;
+ }
+
+ static const char *names[] = {
+ [PL_SHADER_SIG_NONE] = "PL_SHADER_SIG_NONE",
+ [PL_SHADER_SIG_COLOR] = "PL_SHADER_SIG_COLOR",
+ };
+ // NOTE(review): `names` has no PL_SHADER_SIG_SAMPLER entry - confirm `insig` is never that value
+ // If we require an input, but there is none available - just get it from
+ // the user by turning it into an explicit input signature.
+ if (!sh->output && insig) {
+ pl_assert(!sh->input);
+ sh->input = insig;
+ } else if (sh->output != insig) {
+ SH_FAIL(sh, "Illegal sequence of shader operations! Current output "
+ "signature is '%s', but called operation expects '%s'!",
+ names[sh->output], names[insig]);
+ return false;
+ }
+
+ // All of our shaders end up returning a vec4 color
+ sh->output = PL_SHADER_SIG_COLOR;
+ sh->output_w = PL_DEF(sh->output_w, w);
+ sh->output_h = PL_DEF(sh->output_h, h);
+ return true;
+}
+
+/*
+ * Drop one reference on `obj`. When this was the last reference, the
+ * object's `uninit` callback (if set) is invoked with its GPU and private
+ * data, and the object is freed.
+ */
+static void sh_obj_deref(pl_shader_obj obj)
+{
+    if (pl_rc_deref(&obj->rc)) {
+        if (obj->uninit)
+            obj->uninit(obj->gpu, obj->priv);
+
+        pl_free(obj);
+    }
+}
+
+/*
+ * Release the caller's reference on the shader object `*ptr` and reset the
+ * pointer to NULL. Does nothing if `*ptr` is already NULL.
+ */
+void pl_shader_obj_destroy(pl_shader_obj *ptr)
+{
+    pl_shader_obj obj = *ptr;
+    if (obj) {
+        sh_obj_deref(obj);
+        *ptr = NULL;
+    }
+}
+/*
+ * Obtain the private state of the shader object `*ptr` for use by `sh`,
+ * creating the object first if `*ptr` is NULL.
+ *
+ * Returns NULL if `ptr` itself is NULL. Marks the shader as failed and
+ * returns NULL if an existing object belongs to a different GPU or has a
+ * different `type`. A newly created object gets `priv_size` zeroed bytes of
+ * private data and remembers `uninit` for cleanup. In all successful cases
+ * the object is appended to `sh->obj` with an extra reference, `*ptr` is
+ * updated, and the private data pointer is returned.
+ */
+void *sh_require_obj(pl_shader sh, pl_shader_obj *ptr,
+ enum pl_shader_obj_type type, size_t priv_size,
+ void (*uninit)(pl_gpu gpu, void *priv))
+{
+ if (!ptr)
+ return NULL;
+
+ pl_shader_obj obj = *ptr;
+ if (obj && obj->gpu != SH_GPU(sh)) {
+ SH_FAIL(sh, "Passed pl_shader_obj belongs to different GPU!");
+ return NULL;
+ }
+
+ if (obj && obj->type != type) {
+ SH_FAIL(sh, "Passed pl_shader_obj of wrong type! Shader objects must "
+ "always be used with the same type of shader.");
+ return NULL;
+ }
+
+ if (!obj) {
+ obj = pl_zalloc_ptr(NULL, obj);
+ pl_rc_init(&obj->rc);
+ obj->gpu = SH_GPU(sh);
+ obj->type = type;
+ obj->priv = pl_zalloc(obj, priv_size);
+ obj->uninit = uninit;
+ }
+
+ PL_ARRAY_APPEND(sh, sh->obj, obj);
+ pl_rc_ref(&obj->rc); // reference held by the shader; dropped again in sh_deref
+
+ *ptr = obj;
+ return obj->priv;
+}
+/*
+ * Emit a pseudo-random number generator into `sh` and return the identifier
+ * of a macro that, when expanded, advances the generator state and yields a
+ * `vec3` (the state scaled by 1/0xFFFFFFFF).
+ *
+ * The state is a `uvec3` seeded from `gl_FragCoord.xy`; with `temporal` set,
+ * the third component comes from the shader params' `index`, otherwise it is
+ * zero. If `p_state` is non-NULL it receives the identifier of the state
+ * variable.
+ */
+ident_t sh_prng(pl_shader sh, bool temporal, ident_t *p_state)
+{
+ ident_t randfun = sh_fresh(sh, "rand"),
+ state = sh_fresh(sh, "state");
+
+ // Based on pcg3d (http://jcgt.org/published/0009/03/02/)
+ GLSLP("#define prng_t uvec3\n");
+ GLSLH("vec3 "$"(inout uvec3 s) { \n"
+ " s = 1664525u * s + uvec3(1013904223u); \n"
+ " s.x += s.y * s.z; \n"
+ " s.y += s.z * s.x; \n"
+ " s.z += s.x * s.y; \n"
+ " s ^= s >> 16u; \n"
+ " s.x += s.y * s.z; \n"
+ " s.y += s.z * s.x; \n"
+ " s.z += s.x * s.y; \n"
+ " return vec3(s) * 1.0/float(0xFFFFFFFFu); \n"
+ "} \n",
+ randfun);
+
+ if (temporal) {
+ GLSL("uvec3 "$" = uvec3(gl_FragCoord.xy, "$"); \n",
+ state, SH_UINT_DYN(SH_PARAMS(sh).index));
+ } else {
+ GLSL("uvec3 "$" = uvec3(gl_FragCoord.xy, 0.0); \n", state);
+ }
+
+ if (p_state)
+ *p_state = state;
+
+ ident_t res = sh_fresh(sh, "RAND");
+ GLSLH("#define "$" ("$"("$"))\n", res, randfun, state);
+ return res;
+}
diff --git a/src/shaders.h b/src/shaders.h
new file mode 100644
index 0000000..7656a35
--- /dev/null
+++ b/src/shaders.h
@@ -0,0 +1,387 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <limits.h>
+
+#include "common.h"
+#include "cache.h"
+#include "log.h"
+#include "gpu.h"
+
+#include <libplacebo/shaders.h>
+
+// This represents an identifier (e.g. name of function, uniform etc.) for
+// a shader resource. Not human-readable.
+
+typedef unsigned short ident_t;
+#define $ "_%hx" // printf-style format used to render an ident_t (see sh_ident_tostr)
+#define NULL_IDENT 0u // "no identifier" value
+
+#define sh_mkident(id, name) ((ident_t) id) // `name` is ignored by this expansion
+#define sh_ident_tostr(id) pl_asprintf(sh->tmp, $, id) // requires a variable named `sh` at the call site
+
+enum {
+ IDENT_BITS = 8 * sizeof(ident_t), // width of ident_t in bits (assumes 8-bit bytes)
+ IDENT_MASK = (uintptr_t) USHRT_MAX, // low bits holding the ident_t itself
+ IDENT_SENTINEL = (uintptr_t) 0x20230319 << IDENT_BITS, // tag placed above the ident bits by sh_ident_pack()
+};
+
+// Functions to pack/unpack an identifier into a `const char *` name field.
+// Used to defer string templating of friendly names until actually necessary
+// Encodes `id` into a fake `const char *` by OR-ing it with IDENT_SENTINEL.
+// The result is not a dereferenceable string; decode it with sh_ident_unpack.
+static inline const char *sh_ident_pack(ident_t id)
+{
+    const uintptr_t packed = IDENT_SENTINEL | id;
+    return (const char *) packed;
+}
+
+// Recovers the identifier stored by sh_ident_pack. Asserts that the bits above
+// the identifier match IDENT_SENTINEL, i.e. that `name` really is a packed
+// identifier rather than an ordinary string pointer.
+static inline ident_t sh_ident_unpack(const char *name)
+{
+    const uintptr_t bits = (uintptr_t) name;
+    assert((bits & ~IDENT_MASK) == IDENT_SENTINEL);
+    return (ident_t) (bits & IDENT_MASK);
+}
+
+enum pl_shader_buf {
+ SH_BUF_PRELUDE, // extra #defines etc.
+ SH_BUF_HEADER, // previous passes, helper function definitions, etc.
+ SH_BUF_BODY, // partial contents of the "current" function
+ SH_BUF_FOOTER, // will be appended to the end of the current function
+ SH_BUF_COUNT, // number of buffers; sizes pl_shader_t.buffers
+};
+
+enum pl_shader_type {
+ SH_AUTO, // NOTE(review): presumably "not decided yet" — confirm
+ SH_COMPUTE, // NOTE(review): presumably selected via sh_try_compute() — confirm
+ SH_FRAGMENT // NOTE(review): presumably a fragment shader — confirm
+};
+
+struct sh_info {
+ // public-facing struct
+ struct pl_shader_info_t info; // pl_shader_info_ref() hands out &info; also holds SH_PARAMS()
+
+ // internal fields
+ void *tmp;
+ pl_rc_t rc; // reference count, see pl_shader_info_ref() / pl_shader_info_deref()
+ pl_str desc;
+ PL_ARRAY(const char *) steps; // description strings appended by sh_describe()
+};
+
+struct pl_shader_t {
+ pl_log log;
+ void *tmp; // temporary allocations (freed on pl_shader_reset)
+ struct sh_info *info; // shared info; SH_PARAMS(sh) reads info->info.params
+ pl_str data; // pooled/recycled scratch buffer for small allocations
+ PL_ARRAY(pl_shader_obj) obj; // objects attached through sh_require_obj()
+ bool failed; // set by SH_FAIL()
+ bool mutable;
+ ident_t name;
+ enum pl_shader_sig input, output;
+ int output_w;
+ int output_h;
+ bool transpose;
+ pl_str_builder buffers[SH_BUF_COUNT]; // one builder per enum pl_shader_buf, written by GLSL*()
+ enum pl_shader_type type;
+ bool flexible_work_groups;
+ int group_size[2];
+ size_t shmem;
+ enum pl_sampler_type sampler_type;
+ char sampler_prefix;
+ unsigned short prefix; // pre-processed version of res.params.id
+ unsigned short fresh; // NOTE(review): presumably the counter behind sh_fresh() — confirm
+
+ // Note: internally, these `pl_shader_va` etc. use raw ident_t fields
+ // instead of `const char *` wherever a name is required! These are
+ // translated to legal strings either in `pl_shader_finalize`, or inside
+ // the `pl_dispatch` shader compilation step.
+ PL_ARRAY(struct pl_shader_va) vas;
+ PL_ARRAY(struct pl_shader_var) vars;
+ PL_ARRAY(struct pl_shader_desc) descs;
+ PL_ARRAY(struct pl_shader_const) consts;
+
+ // cached result of `pl_shader_finalize`
+ struct pl_shader_res result;
+};
+
+// Free temporary resources associated with a shader. Normally called by
+// pl_shader_reset(), but used internally to reduce memory waste.
+void sh_deref(pl_shader sh);
+
+// Same as `pl_shader_finalize` but doesn't generate `sh->result`, instead
+// returns the string builder to be used to finalize the shader. Assumes the
+// caller will access the shader's internal fields directly.
+pl_str_builder sh_finalize_internal(pl_shader sh);
+
+// Helper functions for convenience
+#define SH_PARAMS(sh) ((sh)->info->info.params)
+#define SH_GPU(sh) (SH_PARAMS(sh).gpu)
+#define SH_CACHE(sh) pl_gpu_cache(SH_GPU(sh))
+
+// Returns the GLSL version, defaulting to desktop 130.
+struct pl_glsl_version sh_glsl(const pl_shader sh);
+
+#define SH_FAIL(sh, ...) do { \
+ sh->failed = true; \
+ PL_ERR(sh, __VA_ARGS__); \
+ } while (0)
+
+// Attempt enabling compute shaders for this pass, if possible
+bool sh_try_compute(pl_shader sh, int bw, int bh, bool flex, size_t mem);
+
+// Attempt merging a secondary shader into the current shader. Returns
+// NULL_IDENT if merging fails (e.g. incompatible signatures); otherwise returns
+// an identifier corresponding to the generated subpass function.
+//
+// If successful, the subpass shader is set to an undefined failure state and
+// must be explicitly reset/aborted before being re-used.
+ident_t sh_subpass(pl_shader sh, pl_shader sub);
+
+// Helpers for adding new variables/descriptors/etc. with fresh, unique
+// identifier names. These will never conflict with other identifiers, even
+// if the shaders are merged together.
+ident_t sh_fresh(pl_shader sh, const char *name);
+
+// Add a new shader var and return its identifier
+ident_t sh_var(pl_shader sh, struct pl_shader_var sv);
+
+// Helper functions for `sh_var`
+ident_t sh_var_int(pl_shader sh, const char *name, int val, bool dynamic);
+ident_t sh_var_uint(pl_shader sh, const char *name, unsigned int val, bool dynamic);
+ident_t sh_var_float(pl_shader sh, const char *name, float val, bool dynamic);
+ident_t sh_var_mat3(pl_shader sh, const char *name, pl_matrix3x3 val);
+#define SH_INT_DYN(val) sh_var_int(sh, "const", val, true)
+#define SH_UINT_DYN(val) sh_var_uint(sh, "const", val, true)
+#define SH_FLOAT_DYN(val) sh_var_float(sh, "const", val, true)
+#define SH_MAT3(val) sh_var_mat3(sh, "mat", val)
+
+// Add a new shader desc and return its identifier.
+ident_t sh_desc(pl_shader sh, struct pl_shader_desc sd);
+
+// Add a new shader constant and return its identifier.
+ident_t sh_const(pl_shader sh, struct pl_shader_const sc);
+
+// Helper functions for `sh_const`
+ident_t sh_const_int(pl_shader sh, const char *name, int val);
+ident_t sh_const_uint(pl_shader sh, const char *name, unsigned int val);
+ident_t sh_const_float(pl_shader sh, const char *name, float val);
+#define SH_INT(val) sh_const_int(sh, "const", val)
+#define SH_UINT(val) sh_const_uint(sh, "const", val)
+#define SH_FLOAT(val) sh_const_float(sh, "const", val)
+
+// Add a new shader va and return its identifier
+ident_t sh_attr(pl_shader sh, struct pl_shader_va sva);
+
+// Helper to add a vec2 VA from a pl_rect2df. Returns NULL_IDENT on failure.
+ident_t sh_attr_vec2(pl_shader sh, const char *name, const pl_rect2df *rc);
+
+// Bind a texture under a given transformation and make its attributes
+// available as well. If an output pointer for one of the attributes is left
+// as NULL, that attribute will not be added. Returns NULL_IDENT on failure.
+// `rect` is optional, and defaults to the full texture if left as NULL.
+//
+// Note that for e.g. compute shaders, the vec2 out_pos might be a macro that
+// expands to an expensive computation, and should be cached by the user.
+ident_t sh_bind(pl_shader sh, pl_tex tex,
+ enum pl_tex_address_mode address_mode,
+ enum pl_tex_sample_mode sample_mode,
+ const char *name, const pl_rect2df *rect,
+ ident_t *out_pos, ident_t *out_pt);
+
+// Incrementally build up a buffer by adding new variable elements to the
+// buffer, resizing buf.buffer_vars if necessary. Returns whether or not the
+// variable could be successfully added (which may fail if you try exceeding
+// the size limits of the buffer type). If successful, the layout is stored
+// in *out_layout (may be NULL).
+bool sh_buf_desc_append(void *alloc, pl_gpu gpu,
+ struct pl_shader_desc *buf_desc,
+ struct pl_var_layout *out_layout,
+ const struct pl_var new_var);
+
+size_t sh_buf_desc_size(const struct pl_shader_desc *buf_desc);
+
+
+// Underlying function for appending text to a shader
+#define sh_append(sh, buf, ...) \
+ pl_str_builder_addf((sh)->buffers[buf], __VA_ARGS__)
+
+#define sh_append_str(sh, buf, str) \
+ pl_str_builder_str((sh)->buffers[buf], str)
+
+#define GLSLP(...) sh_append(sh, SH_BUF_PRELUDE, __VA_ARGS__)
+#define GLSLH(...) sh_append(sh, SH_BUF_HEADER, __VA_ARGS__)
+#define GLSL(...) sh_append(sh, SH_BUF_BODY, __VA_ARGS__)
+#define GLSLF(...) sh_append(sh, SH_BUF_FOOTER, __VA_ARGS__)
+
+// Attach a description to a shader
+void sh_describef(pl_shader sh, const char *fmt, ...)
+ PL_PRINTF(2, 3);
+
+// Attaches a description step to a shader by appending `desc` to the `steps`
+// array of the shader's info. Only the pointer is stored; the string is not
+// copied here.
+static inline void sh_describe(pl_shader sh, const char *desc)
+{
+    PL_ARRAY_APPEND(sh->info, sh->info->steps, desc);
+}
+
+// Requires that the shader is mutable, has an output signature compatible
+// with the given input signature, as well as an output size compatible with
+// the given size requirements. Errors and returns false otherwise.
+bool sh_require(pl_shader sh, enum pl_shader_sig insig, int w, int h);
+
+// Shader resources
+
+enum pl_shader_obj_type { // tag compared by sh_require_obj() against the requested type
+ PL_SHADER_OBJ_INVALID = 0,
+ PL_SHADER_OBJ_COLOR_MAP,
+ PL_SHADER_OBJ_SAMPLER,
+ PL_SHADER_OBJ_DITHER,
+ PL_SHADER_OBJ_LUT,
+ PL_SHADER_OBJ_AV1_GRAIN,
+ PL_SHADER_OBJ_FILM_GRAIN,
+ PL_SHADER_OBJ_RESHAPE,
+};
+
+struct pl_shader_obj_t {
+ enum pl_shader_obj_type type; // fixed at creation; sh_require_obj() rejects a mismatch
+ pl_rc_t rc; // reference count; incremented each time sh_require_obj() attaches the object
+ pl_gpu gpu; // GPU of the shader that created the object; sh_require_obj() rejects a mismatch
+ void (*uninit)(pl_gpu gpu, void *priv); // NOTE(review): presumably called on `priv` when the last reference is dropped — confirm
+ void *priv; // zero-initialized private data, allocated as a child of the object
+};
+
+// Returns (*ptr)->priv, or NULL on failure
+void *sh_require_obj(pl_shader sh, pl_shader_obj *ptr,
+ enum pl_shader_obj_type type, size_t priv_size,
+ void (*uninit)(pl_gpu gpu, void *priv));
+
+#define SH_OBJ(sh, ptr, type, t, uninit) \
+ ((t*) sh_require_obj(sh, ptr, type, sizeof(t), uninit))
+
+// Initializes a PRNG. The resulting string will directly evaluate to a
+// pseudorandom, uniformly distributed vec3 from [0.0,1.0]. Since this
+// algorithm works by mutating a state variable, if the user wants to use the
+// resulting PRNG inside a subfunction, they must add an extra `inout prng_t %s`
+// with the contents of `state` to the signature. (Optional)
+//
+// If `temporal` is set, the PRNG will vary across frames.
+ident_t sh_prng(pl_shader sh, bool temporal, ident_t *state);
+
+// Backing memory type
+enum sh_lut_type {
+ SH_LUT_AUTO = 0, // pick whatever makes the most sense (value of a zero-initialized field)
+ SH_LUT_TEXTURE, // upload as texture
+ SH_LUT_UNIFORM, // uniform array
+ SH_LUT_LITERAL, // constant / literal array in shader source (fallback)
+};
+
+// Interpolation method
+enum sh_lut_method {
+ SH_LUT_NONE = 0, // no interpolation, integer indices (value of a zero-initialized field)
+ SH_LUT_LINEAR, // linear interpolation, vecN indices in range [0,1]
+ SH_LUT_CUBIC, // (bi/tri)cubic interpolation
+ SH_LUT_TETRAHEDRAL, // tetrahedral interpolation for vec3, equivalent to
+ // SH_LUT_LINEAR for lower dimensions
+};
+
+struct sh_lut_params {
+ pl_shader_obj *object; // NOTE(review): presumably the shader object backing this LUT — confirm
+
+ // Type of the LUT we intend to generate.
+ //
+ // Note: If `var_type` is PL_VAR_*INT, `method` must be SH_LUT_NONE.
+ enum pl_var_type var_type;
+ enum sh_lut_type lut_type;
+ enum sh_lut_method method;
+
+ // For SH_LUT_TEXTURE, this can be used to override the texture's internal
+ // format, in which case it takes precedence over the default for `type`.
+ pl_fmt fmt;
+
+ // LUT dimensions. Unused dimensions may be left as 0.
+ int width;
+ int height;
+ int depth;
+ int comps;
+
+ // If true, the LUT will always be regenerated, even if the dimensions have
+ // not changed.
+ bool update;
+
+ // Alternate way of triggering shader invalidations. If the signature
+ // does not match the LUT's signature, it will be regenerated.
+ uint64_t signature;
+
+ // If set to true, shader objects will be preserved and updated in-place
+ // rather than being treated as read-only.
+ bool dynamic;
+
+ // If set, generated shader objects are automatically cached in this
+ // cache. Requires `signature` to be set (and uniquely identify the LUT).
+ pl_cache cache;
+
+ // Will be called with a zero-initialized buffer whenever the data needs to
+ // be computed, which happens whenever the size is changed, the shader
+ // object is invalidated, or `update` is set to true.
+ //
+ // Note: Interpretation of `data` is according to `type` and `fmt`.
+ void (*fill)(void *data, const struct sh_lut_params *params);
+ void *priv;
+
+ // Debug tag to track LUT source (filled in by the sh_lut_params() macro)
+ pl_debug_tag debug_tag;
+};
+
+#define sh_lut_params(...) (&(struct sh_lut_params) { \
+ .debug_tag = PL_DEBUG_TAG, \
+ __VA_ARGS__ \
+ })
+
+// Makes a table of values available as a shader variable, using a given
+// method (falling back if needed). The resulting identifier can be sampled
+// directly as %s(pos), where pos is a vector with the right number of
+// dimensions. `pos` must be an integer vector within the bounds of the array,
+// unless the method is `SH_LUT_LINEAR`, in which case it's a float vector that
+// gets interpolated and clamped as needed. Returns NULL_IDENT on error.
+ident_t sh_lut(pl_shader sh, const struct sh_lut_params *params);
+
+// Counts the components selected by a 4-bit component mask. Asserts that no
+// bit above the low four is set.
+static inline uint8_t sh_num_comps(uint8_t mask)
+{
+    pl_assert(!(mask & ~0xF));
+    return __builtin_popcount(mask);
+}
+
+// Returns the GLSL floating point type name ("float", "vec2", "vec3" or
+// "vec4") with as many components as there are bits set in `mask`. A mask
+// selecting no components is treated as unreachable.
+static inline const char *sh_float_type(uint8_t mask)
+{
+    static const char * const names[] = { "float", "vec2", "vec3", "vec4" };
+
+    const int comps = sh_num_comps(mask);
+    if (comps >= 1 && comps <= 4)
+        return names[comps - 1];
+
+    pl_unreachable();
+}
+
+// Returns the GLSL swizzle string ("r", "rg", "gba", ...) selecting the
+// components whose bits are set in `mask` (bit 0 = r, bit 1 = g, bit 2 = b,
+// bit 3 = a). Returns NULL for an empty mask.
+static inline const char *sh_swizzle(uint8_t mask)
+{
+    static const char * const swizzles[0x10] = {
+        NULL, "r",  "g",  "rg",  "b",  "rb",  "gb",  "rgb",
+        "a",  "ra", "ga", "rga", "ba", "rba", "gba", "rgba",
+    };
+
+    // Valid indices are 0..15; `<=` would let mask == 16 read past the table
+    pl_assert(mask < PL_ARRAY_SIZE(swizzles));
+    return swizzles[mask];
+}
diff --git a/src/shaders/colorspace.c b/src/shaders/colorspace.c
new file mode 100644
index 0000000..c7b3b5a
--- /dev/null
+++ b/src/shaders/colorspace.c
@@ -0,0 +1,2120 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <math.h>
+
+#include "cache.h"
+#include "shaders.h"
+
+#include <libplacebo/shaders/colorspace.h>
+
+// Common constants for SMPTE ST.2084 (PQ); substituted into the PQ EOTF/OETF GLSL snippets in this file
+static const float PQ_M1 = 2610./4096 * 1./4,
+ PQ_M2 = 2523./4096 * 128,
+ PQ_C1 = 3424./4096,
+ PQ_C2 = 2413./4096 * 32,
+ PQ_C3 = 2392./4096 * 32;
+
+// Common constants for ARIB STD-B67 (HLG); A/B/C are substituted into the HLG OETF GLSL snippets in this file
+static const float HLG_A = 0.17883277,
+ HLG_B = 0.28466892,
+ HLG_C = 0.55991073,
+ HLG_REF = 1000.0 / PL_COLOR_SDR_WHITE;
+
+// Common constants for Panasonic V-Log
+static const float VLOG_B = 0.00873,
+ VLOG_C = 0.241514,
+ VLOG_D = 0.598206;
+
+// Common constants for Sony S-Log
+static const float SLOG_A = 0.432699,
+ SLOG_B = 0.037584,
+ SLOG_C = 0.616596 + 0.03,
+ SLOG_P = 3.538813,
+ SLOG_Q = 0.030001,
+ SLOG_K2 = 155.0 / 219.0;
+
+// Emits GLSL converting `color` between premultiplied and independent alpha
+// so that it matches `mode`, and records the new state in `repr->alpha`.
+// Nothing is emitted when `repr->alpha` is neither of the two modes or already
+// equals `mode`. Division is skipped for (near) zero alpha.
+void pl_shader_set_alpha(pl_shader sh, struct pl_color_repr *repr,
+                         enum pl_alpha_mode mode)
+{
+    const bool have_premul = repr->alpha == PL_ALPHA_PREMULTIPLIED;
+    const bool have_indep  = repr->alpha == PL_ALPHA_INDEPENDENT;
+
+    if (have_premul && mode == PL_ALPHA_INDEPENDENT) {
+        // premultiplied -> independent
+        GLSL("if (color.a > 1e-6) \n"
+             " color.rgb /= vec3(color.a); \n");
+        repr->alpha = PL_ALPHA_INDEPENDENT;
+    } else if (have_indep && mode == PL_ALPHA_PREMULTIPLIED) {
+        // independent -> premultiplied
+        GLSL("color.rgb *= vec3(color.a); \n");
+        repr->alpha = PL_ALPHA_PREMULTIPLIED;
+    }
+}
+
+#ifdef PL_HAVE_DOVI
+// Emits the GLSL for the MMR branch of the reshaping code: accumulates into
+// `s` the constant term `coeffs.x` plus dot products of the packed weights in
+// the uniform array `mmr` with powers of `sig` and its cross terms.
+//
+// The emitted code relies on `sig`, `coeffs` and `s` being declared by the
+// surrounding GLSL.
+//
+//   single     - if true, the weights start at index 0; otherwise the start
+//                index is read from `coeffs.y`
+//   min_order,
+//   max_order  - range of orders present; run-time checks on `coeffs.w`
+//                are only emitted when the two differ
+static inline void reshape_mmr(pl_shader sh, ident_t mmr, bool single,
+                               int min_order, int max_order)
+{
+    if (single) {
+        GLSL("const uint mmr_idx = 0u; \n");
+    } else {
+        GLSL("uint mmr_idx = uint(coeffs.y); \n");
+    }
+
+    assert(min_order <= max_order);
+    if (min_order < max_order)
+        GLSL("uint order = uint(coeffs.w); \n");
+
+    // First-order terms are always present
+    GLSL("vec4 sigX; \n"
+         "s = coeffs.x; \n"
+         "sigX.xyz = sig.xxy * sig.yzz; \n"
+         "sigX.w = sigX.x * sig.z; \n"
+         "s += dot("$"[mmr_idx + 0].xyz, sig); \n"
+         "s += dot("$"[mmr_idx + 1], sigX); \n",
+         mmr, mmr);
+
+    if (max_order >= 2) {
+        if (min_order < 2)
+            GLSL("if (order >= 2) { \n");
+
+        GLSL("vec3 sig2 = sig * sig; \n"
+             "vec4 sigX2 = sigX * sigX; \n"
+             "s += dot("$"[mmr_idx + 2].xyz, sig2); \n"
+             "s += dot("$"[mmr_idx + 3], sigX2); \n",
+             mmr, mmr);
+
+        if (max_order == 3) {
+            // The condition previously lacked its closing parenthesis, which
+            // made the generated shader fail to compile
+            if (min_order < 3)
+                GLSL("if (order >= 3) { \n");
+
+            GLSL("s += dot("$"[mmr_idx + 4].xyz, sig2 * sig); \n"
+                 "s += dot("$"[mmr_idx + 5], sigX2 * sigX); \n",
+                 mmr, mmr);
+
+            if (min_order < 3)
+                GLSL("} \n");
+        }
+
+        if (min_order < 2)
+            GLSL("} \n");
+    }
+}
+
+// Emits the GLSL for the polynomial branch of the reshaping code: evaluates
+// the quadratic with coefficients `coeffs.xyz` at `s` (Horner form) and
+// stores the result back into `s`.
+static inline void reshape_poly(pl_shader sh)
+{
+    GLSL("s = (coeffs.z * s + coeffs.y) * s + coeffs.x; \n");
+}
+#endif
+ /*
+  * Emits GLSL applying the per-component reshaping described by `data` to
+  * `color`.
+  *
+  * Does nothing if the shader cannot accept a PL_SHADER_SIG_COLOR input or if
+  * `data` is NULL. For each of the three components with a non-zero pivot
+  * count, the emitted code selects the coefficient set of the piece that the
+  * input falls into, applies either polynomial or MMR reshaping (see
+  * reshape_poly / reshape_mmr), and clamps the result between the first and
+  * last pivot.
+  *
+  * When built without PL_HAVE_DOVI, the shader is failed instead.
+  */
+void pl_shader_dovi_reshape(pl_shader sh, const struct pl_dovi_metadata *data)
+{
+#ifdef PL_HAVE_DOVI
+ if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0) || !data)
+ return;
+
+ sh_describe(sh, "reshaping");
+ GLSL("// pl_shader_reshape \n"
+ "{ \n"
+ "vec3 sig; \n"
+ "vec4 coeffs; \n"
+ "float s; \n"
+ "sig = clamp(color.rgb, 0.0, 1.0); \n");
+
+ float coeffs_data[8][4]; // one vec4 per piece; at most 9 pivots => at most 8 pieces
+ float mmr_packed_data[8*6][4]; // NOTE(review): sized for 8 pieces x 3 orders x 2 vec4s; assumes mmr_order <= 3 — confirm
+
+ for (int c = 0; c < 3; c++) {
+ const struct pl_reshape_data *comp = &data->comp[c];
+ if (!comp->num_pivots)
+ continue;
+
+ pl_assert(comp->num_pivots >= 2 && comp->num_pivots <= 9);
+ GLSL("s = sig[%d]; \n", c);
+
+ // Prepare coefficients for GPU
+ bool has_poly = false, has_mmr = false, mmr_single = true;
+ int mmr_idx = 0, min_order = 3, max_order = 1;
+ memset(coeffs_data, 0, sizeof(coeffs_data));
+ for (int i = 0; i < comp->num_pivots - 1; i++) {
+ switch (comp->method[i]) {
+ case 0: // polynomial
+ has_poly = true;
+ coeffs_data[i][3] = 0.0; // order=0 signals polynomial
+ for (int k = 0; k < 3; k++)
+ coeffs_data[i][k] = comp->poly_coeffs[i][k];
+ break;
+
+ case 1: // MMR
+ min_order = PL_MIN(min_order, comp->mmr_order[i]);
+ max_order = PL_MAX(max_order, comp->mmr_order[i]);
+ mmr_single = !has_mmr; // stays true only while exactly one MMR piece has been seen
+ has_mmr = true;
+ coeffs_data[i][3] = (float) comp->mmr_order[i];
+ coeffs_data[i][0] = comp->mmr_constant[i];
+ coeffs_data[i][1] = (float) mmr_idx;
+ for (int j = 0; j < comp->mmr_order[i]; j++) {
+ // store weights per order as two packed vec4s
+ float *mmr = &mmr_packed_data[mmr_idx][0];
+ mmr[0] = comp->mmr_coeffs[i][j][0];
+ mmr[1] = comp->mmr_coeffs[i][j][1];
+ mmr[2] = comp->mmr_coeffs[i][j][2];
+ mmr[3] = 0.0; // unused
+ mmr[4] = comp->mmr_coeffs[i][j][3];
+ mmr[5] = comp->mmr_coeffs[i][j][4];
+ mmr[6] = comp->mmr_coeffs[i][j][5];
+ mmr[7] = comp->mmr_coeffs[i][j][6];
+ mmr_idx += 2;
+ }
+ break;
+
+ default:
+ pl_unreachable();
+ }
+ }
+
+ if (comp->num_pivots > 2) {
+
+ // Skip the (irrelevant) lower and upper bounds
+ float pivots_data[7];
+ memcpy(pivots_data, comp->pivots + 1,
+ (comp->num_pivots - 2) * sizeof(pivots_data[0]));
+
+ // Fill the remainder with a quasi-infinite sentinel pivot
+ for (int i = comp->num_pivots - 2; i < PL_ARRAY_SIZE(pivots_data); i++)
+ pivots_data[i] = 1e9f;
+
+ ident_t pivots = sh_var(sh, (struct pl_shader_var) {
+ .data = pivots_data,
+ .var = {
+ .name = "pivots",
+ .type = PL_VAR_FLOAT,
+ .dim_v = 1,
+ .dim_m = 1,
+ .dim_a = PL_ARRAY_SIZE(pivots_data),
+ },
+ });
+
+ ident_t coeffs = sh_var(sh, (struct pl_shader_var) {
+ .data = coeffs_data,
+ .var = {
+ .name = "coeffs",
+ .type = PL_VAR_FLOAT,
+ .dim_v = 4,
+ .dim_m = 1,
+ .dim_a = PL_ARRAY_SIZE(coeffs_data),
+ },
+ });
+
+ // Efficiently branch into the correct set of coefficients
+ GLSL("#define test(i) bvec4(s >= "$"[i]) \n"
+ "#define coef(i) "$"[i] \n"
+ "coeffs = mix(mix(mix(coef(0), coef(1), test(0)), \n"
+ " mix(coef(2), coef(3), test(2)), \n"
+ " test(1)), \n"
+ " mix(mix(coef(4), coef(5), test(4)), \n"
+ " mix(coef(6), coef(7), test(6)), \n"
+ " test(5)), \n"
+ " test(3)); \n"
+ "#undef test \n"
+ "#undef coef \n",
+ pivots, coeffs);
+
+ } else {
+
+ // No need for a single pivot, just set the coeffs directly
+ GLSL("coeffs = "$"; \n", sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_vec4("coeffs"),
+ .data = coeffs_data,
+ }));
+
+ }
+
+ ident_t mmr = NULL_IDENT;
+ if (has_mmr) {
+ mmr = sh_var(sh, (struct pl_shader_var) {
+ .data = mmr_packed_data,
+ .var = {
+ .name = "mmr",
+ .type = PL_VAR_FLOAT,
+ .dim_v = 4,
+ .dim_m = 1,
+ .dim_a = mmr_idx,
+ },
+ });
+ }
+
+ if (has_mmr && has_poly) {
+ GLSL("if (coeffs.w == 0.0) { \n");
+ reshape_poly(sh);
+ GLSL("} else { \n");
+ reshape_mmr(sh, mmr, mmr_single, min_order, max_order);
+ GLSL("} \n");
+ } else if (has_poly) {
+ reshape_poly(sh);
+ } else {
+ assert(has_mmr);
+ GLSL("{ \n");
+ reshape_mmr(sh, mmr, mmr_single, min_order, max_order);
+ GLSL("} \n");
+ }
+
+ ident_t lo = sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_float("lo"),
+ .data = &comp->pivots[0],
+ });
+ ident_t hi = sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_float("hi"),
+ .data = &comp->pivots[comp->num_pivots - 1],
+ });
+ GLSL("color[%d] = clamp(s, "$", "$"); \n", c, lo, hi);
+ }
+
+ GLSL("} \n");
+#else
+ SH_FAIL(sh, "libplacebo was compiled without support for dolbyvision reshaping");
+#endif
+}
+ /*
+  * Emits GLSL that decodes `color` according to `repr`, optionally applying
+  * the adjustments in `params` (may be NULL).
+  *
+  * Steps, in emission order:
+  *  1. convert alpha to independent (pl_shader_set_alpha)
+  *  2. for XYZ and Dolby Vision: scale by pl_color_repr_normalize(); XYZ is
+  *     additionally linearized (ST428), Dolby Vision is reshaped
+  *  3. apply the matrix and offset from pl_color_repr_decode(), unless that
+  *     transform is the identity
+  *  4. system-specific non-linear post-processing (BT.2020-C, BT.2100 PQ/HLG,
+  *     Dolby Vision)
+  *  5. gamma adjustment, if `params->gamma` differs from 1
+  *
+  * `repr` is passed by non-const pointer to the helpers above. Does nothing
+  * if the shader cannot accept a PL_SHADER_SIG_COLOR input.
+  */
+void pl_shader_decode_color(pl_shader sh, struct pl_color_repr *repr,
+ const struct pl_color_adjustment *params)
+{
+ if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+ return;
+
+ sh_describe(sh, "color decoding");
+ GLSL("// pl_shader_decode_color \n"
+ "{ \n");
+
+ // Do this first because the following operations are potentially nonlinear
+ pl_shader_set_alpha(sh, repr, PL_ALPHA_INDEPENDENT);
+
+ if (repr->sys == PL_COLOR_SYSTEM_XYZ ||
+ repr->sys == PL_COLOR_SYSTEM_DOLBYVISION)
+ {
+ ident_t scale = SH_FLOAT(pl_color_repr_normalize(repr));
+ GLSL("color.rgb *= vec3("$"); \n", scale);
+ }
+
+ if (repr->sys == PL_COLOR_SYSTEM_XYZ) {
+ pl_shader_linearize(sh, &(struct pl_color_space) {
+ .transfer = PL_COLOR_TRC_ST428,
+ });
+ }
+
+ if (repr->sys == PL_COLOR_SYSTEM_DOLBYVISION)
+ pl_shader_dovi_reshape(sh, repr->dovi);
+
+ enum pl_color_system orig_sys = repr->sys; // saved before `repr` is handed to pl_color_repr_decode()
+ pl_transform3x3 tr = pl_color_repr_decode(repr, params);
+
+ if (memcmp(&tr, &pl_transform3x3_identity, sizeof(tr))) {
+ ident_t cmat = sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_mat3("cmat"),
+ .data = PL_TRANSPOSE_3X3(tr.mat.m),
+ });
+
+ ident_t cmat_c = sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_vec3("cmat_c"),
+ .data = tr.c,
+ });
+
+ GLSL("color.rgb = "$" * color.rgb + "$"; \n", cmat, cmat_c);
+ }
+
+ switch (orig_sys) {
+ case PL_COLOR_SYSTEM_BT_2020_C:
+ // Conversion for C'rcY'cC'bc via the BT.2020 CL system:
+ // C'bc = (B'-Y'c) / 1.9404 | C'bc <= 0
+ // = (B'-Y'c) / 1.5816 | C'bc > 0
+ //
+ // C'rc = (R'-Y'c) / 1.7184 | C'rc <= 0
+ // = (R'-Y'c) / 0.9936 | C'rc > 0
+ //
+ // as per the BT.2020 specification, table 4. This is a non-linear
+ // transformation because (constant) luminance receives non-equal
+ // contributions from the three different channels.
+ GLSL("// constant luminance conversion \n"
+ "color.br = color.br * mix(vec2(1.5816, 0.9936), \n"
+ " vec2(1.9404, 1.7184), \n"
+ " lessThanEqual(color.br, vec2(0.0))) \n"
+ " + color.gg; \n");
+ // Expand channels to camera-linear light. This shader currently just
+ // assumes everything uses the BT.2020 12-bit gamma function, since the
+ // difference between 10 and 12-bit is negligible for anything other
+ // than 12-bit content.
+ GLSL("vec3 lin = mix(color.rgb * vec3(1.0/4.5), \n"
+ " pow((color.rgb + vec3(0.0993))*vec3(1.0/1.0993), \n"
+ " vec3(1.0/0.45)), \n"
+ " lessThanEqual(vec3(0.08145), color.rgb)); \n");
+ // Calculate the green channel from the expanded RYcB, and recompress to G'
+ // The BT.2020 specification says Yc = 0.2627*R + 0.6780*G + 0.0593*B
+ GLSL("color.g = (lin.g - 0.2627*lin.r - 0.0593*lin.b)*1.0/0.6780; \n"
+ "color.g = mix(color.g * 4.5, \n"
+ " 1.0993 * pow(color.g, 0.45) - 0.0993, \n"
+ " 0.0181 <= color.g); \n");
+ break;
+
+ case PL_COLOR_SYSTEM_BT_2100_PQ:;
+ // Conversion process from the spec:
+ //
+ // 1. L'M'S' = cmat * ICtCp
+ // 2. LMS = linearize(L'M'S') (EOTF for PQ, inverse OETF for HLG)
+ // 3. RGB = lms2rgb * LMS
+ //
+ // After this we need to invert step 2 to arrive at non-linear RGB.
+ // (It's important we keep the transfer function conversion separate
+ // from the color system decoding, so we have to partially undo our
+ // work here even though we will end up linearizing later on anyway)
+
+ GLSL(// PQ EOTF
+ "color.rgb = pow(max(color.rgb, 0.0), vec3(1.0/%f)); \n"
+ "color.rgb = max(color.rgb - vec3(%f), 0.0) \n"
+ " / (vec3(%f) - vec3(%f) * color.rgb); \n"
+ "color.rgb = pow(color.rgb, vec3(1.0/%f)); \n"
+ // LMS matrix
+ "color.rgb = mat3( 3.43661, -0.79133, -0.0259499, \n"
+ " -2.50645, 1.98360, -0.0989137, \n"
+ " 0.06984, -0.192271, 1.12486) * color.rgb; \n"
+ // PQ OETF
+ "color.rgb = pow(max(color.rgb, 0.0), vec3(%f)); \n"
+ "color.rgb = (vec3(%f) + vec3(%f) * color.rgb) \n"
+ " / (vec3(1.0) + vec3(%f) * color.rgb); \n"
+ "color.rgb = pow(color.rgb, vec3(%f)); \n",
+ PQ_M2, PQ_C1, PQ_C2, PQ_C3, PQ_M1,
+ PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2);
+ break;
+
+ case PL_COLOR_SYSTEM_BT_2100_HLG:
+ GLSL(// HLG OETF^-1
+ "color.rgb = mix(vec3(4.0) * color.rgb * color.rgb, \n"
+ " exp((color.rgb - vec3(%f)) * vec3(1.0/%f)) \n"
+ " + vec3(%f), \n"
+ " lessThan(vec3(0.5), color.rgb)); \n"
+ // LMS matrix
+ "color.rgb = mat3( 3.43661, -0.79133, -0.0259499, \n"
+ " -2.50645, 1.98360, -0.0989137, \n"
+ " 0.06984, -0.192271, 1.12486) * color.rgb; \n"
+ // HLG OETF
+ "color.rgb = mix(vec3(0.5) * sqrt(color.rgb), \n"
+ " vec3(%f) * log(color.rgb - vec3(%f)) + vec3(%f), \n"
+ " lessThan(vec3(1.0), color.rgb)); \n",
+ HLG_C, HLG_A, HLG_B,
+ HLG_A, HLG_B, HLG_C);
+ break;
+
+ case PL_COLOR_SYSTEM_DOLBYVISION:;
+#ifdef PL_HAVE_DOVI
+ // Dolby Vision always outputs BT.2020-referred HPE LMS, so hard-code
+ // the inverse LMS->RGB matrix corresponding to this color space.
+ pl_matrix3x3 dovi_lms2rgb = {{
+ { 3.06441879, -2.16597676, 0.10155818},
+ {-0.65612108, 1.78554118, -0.12943749},
+ { 0.01736321, -0.04725154, 1.03004253},
+ }};
+
+ pl_matrix3x3_mul(&dovi_lms2rgb, &repr->dovi->linear);
+ ident_t mat = sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_mat3("lms2rgb"),
+ .data = PL_TRANSPOSE_3X3(dovi_lms2rgb.m),
+ });
+
+ // PQ EOTF
+ GLSL("color.rgb = pow(max(color.rgb, 0.0), vec3(1.0/%f)); \n"
+ "color.rgb = max(color.rgb - vec3(%f), 0.0) \n"
+ " / (vec3(%f) - vec3(%f) * color.rgb); \n"
+ "color.rgb = pow(color.rgb, vec3(1.0/%f)); \n",
+ PQ_M2, PQ_C1, PQ_C2, PQ_C3, PQ_M1);
+ // LMS matrix
+ GLSL("color.rgb = "$" * color.rgb; \n", mat);
+ // PQ OETF
+ GLSL("color.rgb = pow(max(color.rgb, 0.0), vec3(%f)); \n"
+ "color.rgb = (vec3(%f) + vec3(%f) * color.rgb) \n"
+ " / (vec3(1.0) + vec3(%f) * color.rgb); \n"
+ "color.rgb = pow(color.rgb, vec3(%f)); \n",
+ PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2);
+ break;
+#else
+ SH_FAIL(sh, "libplacebo was compiled without support for dolbyvision reshaping");
+ return;
+#endif
+
+ case PL_COLOR_SYSTEM_UNKNOWN:
+ case PL_COLOR_SYSTEM_RGB:
+ case PL_COLOR_SYSTEM_XYZ:
+ case PL_COLOR_SYSTEM_BT_601:
+ case PL_COLOR_SYSTEM_BT_709:
+ case PL_COLOR_SYSTEM_SMPTE_240M:
+ case PL_COLOR_SYSTEM_BT_2020_NC:
+ case PL_COLOR_SYSTEM_YCGCO:
+ break; // no special post-processing needed
+
+ case PL_COLOR_SYSTEM_COUNT:
+ pl_unreachable();
+ }
+
+ // Gamma adjustment. Doing this here (in non-linear light) is technically
+ // somewhat wrong, but this is just an aesthetic parameter and not really
+ // meant for colorimetric precision, so we don't care too much.
+ if (params && params->gamma == 0) {
+ // Avoid division by zero
+ GLSL("color.rgb = vec3(0.0); \n");
+ } else if (params && params->gamma != 1) {
+ ident_t gamma = sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_float("gamma"),
+ .data = &(float){ 1 / params->gamma },
+ });
+ GLSL("color.rgb = pow(max(color.rgb, vec3(0.0)), vec3("$")); \n", gamma);
+ }
+
+ GLSL("}\n");
+}
+ /*
+  * Emits GLSL that encodes `color` into the representation described by
+  * `repr`.
+  *
+  * Steps, in emission order:
+  *  1. system-specific non-linear pre-processing (BT.2020-C, BT.2100 PQ/HLG);
+  *     Dolby Vision is not supported and fails the shader
+  *  2. unless the representation is full-range RGB without any bit depth
+  *     conversion or bit shift: apply the inverse of the transform returned
+  *     by pl_color_repr_decode() for a copy of `repr`; XYZ is additionally
+  *     delinearized (ST428) and scaled
+  *  3. premultiply by alpha if `repr->alpha` is PL_ALPHA_PREMULTIPLIED
+  *
+  * Does nothing if the shader cannot accept a PL_SHADER_SIG_COLOR input.
+  */
+void pl_shader_encode_color(pl_shader sh, const struct pl_color_repr *repr)
+{
+ if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+ return;
+
+ sh_describe(sh, "color encoding");
+ GLSL("// pl_shader_encode_color \n"
+ "{ \n");
+
+ switch (repr->sys) {
+ case PL_COLOR_SYSTEM_BT_2020_C:
+ // Expand R'G'B' to RGB
+ GLSL("vec3 lin = mix(color.rgb * vec3(1.0/4.5), \n"
+ " pow((color.rgb + vec3(0.0993))*vec3(1.0/1.0993), \n"
+ " vec3(1.0/0.45)), \n"
+ " lessThanEqual(vec3(0.08145), color.rgb)); \n");
+
+ // Compute Yc from RGB and compress to R'Y'cB'
+ GLSL("color.g = dot(vec3(0.2627, 0.6780, 0.0593), lin); \n"
+ "color.g = mix(color.g * 4.5, \n"
+ " 1.0993 * pow(color.g, 0.45) - 0.0993, \n"
+ " 0.0181 <= color.g); \n");
+
+ // Compute C'bc and C'rc into color.br
+ GLSL("color.br = color.br - color.gg; \n"
+ "color.br *= mix(vec2(1.0/1.5816, 1.0/0.9936), \n"
+ " vec2(1.0/1.9404, 1.0/1.7184), \n"
+ " lessThanEqual(color.br, vec2(0.0))); \n");
+ break;
+
+ case PL_COLOR_SYSTEM_BT_2100_PQ:;
+ GLSL("color.rgb = pow(max(color.rgb, 0.0), vec3(1.0/%f)); \n"
+ "color.rgb = max(color.rgb - vec3(%f), 0.0) \n"
+ " / (vec3(%f) - vec3(%f) * color.rgb); \n"
+ "color.rgb = pow(color.rgb, vec3(1.0/%f)); \n"
+ "color.rgb = mat3(0.412109, 0.166748, 0.024170, \n"
+ " 0.523925, 0.720459, 0.075440, \n"
+ " 0.063965, 0.112793, 0.900394) * color.rgb; \n"
+ "color.rgb = pow(color.rgb, vec3(%f)); \n"
+ "color.rgb = (vec3(%f) + vec3(%f) * color.rgb) \n"
+ " / (vec3(1.0) + vec3(%f) * color.rgb); \n"
+ "color.rgb = pow(color.rgb, vec3(%f)); \n",
+ PQ_M2, PQ_C1, PQ_C2, PQ_C3, PQ_M1,
+ PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2);
+ break;
+
+ case PL_COLOR_SYSTEM_BT_2100_HLG:
+ GLSL("color.rgb = mix(vec3(4.0) * color.rgb * color.rgb, \n"
+ " exp((color.rgb - vec3(%f)) * vec3(1.0/%f)) \n"
+ " + vec3(%f), \n"
+ " lessThan(vec3(0.5), color.rgb)); \n"
+ "color.rgb = mat3(0.412109, 0.166748, 0.024170, \n"
+ " 0.523925, 0.720459, 0.075440, \n"
+ " 0.063965, 0.112793, 0.900394) * color.rgb; \n"
+ "color.rgb = mix(vec3(0.5) * sqrt(color.rgb), \n"
+ " vec3(%f) * log(color.rgb - vec3(%f)) + vec3(%f), \n"
+ " lessThan(vec3(1.0), color.rgb)); \n",
+ HLG_C, HLG_A, HLG_B,
+ HLG_A, HLG_B, HLG_C);
+ break;
+
+ case PL_COLOR_SYSTEM_DOLBYVISION:
+ SH_FAIL(sh, "Cannot un-apply dolbyvision yet (no inverse reshaping)!");
+ return;
+
+ case PL_COLOR_SYSTEM_UNKNOWN:
+ case PL_COLOR_SYSTEM_RGB:
+ case PL_COLOR_SYSTEM_XYZ:
+ case PL_COLOR_SYSTEM_BT_601:
+ case PL_COLOR_SYSTEM_BT_709:
+ case PL_COLOR_SYSTEM_SMPTE_240M:
+ case PL_COLOR_SYSTEM_BT_2020_NC:
+ case PL_COLOR_SYSTEM_YCGCO:
+ break; // no special pre-processing needed
+
+ case PL_COLOR_SYSTEM_COUNT:
+ pl_unreachable();
+ }
+
+ // Since this is a relatively rare operation, bypass it as much as possible
+ bool skip = true;
+ skip &= PL_DEF(repr->sys, PL_COLOR_SYSTEM_RGB) == PL_COLOR_SYSTEM_RGB;
+ skip &= PL_DEF(repr->levels, PL_COLOR_LEVELS_FULL) == PL_COLOR_LEVELS_FULL;
+ skip &= !repr->bits.sample_depth || !repr->bits.color_depth ||
+ repr->bits.sample_depth == repr->bits.color_depth;
+ skip &= !repr->bits.bit_shift;
+
+ if (!skip) {
+ struct pl_color_repr copy = *repr; // work on a copy: `repr` is const, the helpers take a non-const pointer
+ ident_t xyzscale = NULL_IDENT;
+ if (repr->sys == PL_COLOR_SYSTEM_XYZ)
+ xyzscale = SH_FLOAT(1.0 / pl_color_repr_normalize(&copy));
+
+ pl_transform3x3 tr = pl_color_repr_decode(&copy, NULL);
+ pl_transform3x3_invert(&tr);
+
+ ident_t cmat = sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_mat3("cmat"),
+ .data = PL_TRANSPOSE_3X3(tr.mat.m),
+ });
+
+ ident_t cmat_c = sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_vec3("cmat_c"),
+ .data = tr.c,
+ });
+
+ GLSL("color.rgb = "$" * color.rgb + "$"; \n", cmat, cmat_c);
+
+ if (repr->sys == PL_COLOR_SYSTEM_XYZ) {
+ pl_shader_delinearize(sh, &(struct pl_color_space) {
+ .transfer = PL_COLOR_TRC_ST428,
+ });
+ GLSL("color.rgb *= vec3("$"); \n", xyzscale);
+ }
+ }
+
+ if (repr->alpha == PL_ALPHA_PREMULTIPLIED)
+ GLSL("color.rgb *= vec3(color.a); \n");
+
+ GLSL("}\n");
+}
+
+// Defines a header-level macro expanding to a vec3 that holds row 1 of the
+// RGB->XYZ matrix for `csp->primaries` (the weights producing Y from RGB),
+// and returns the identifier of that macro.
+static ident_t sh_luma_coeffs(pl_shader sh, const struct pl_color_space *csp)
+{
+    const pl_matrix3x3 to_xyz =
+        pl_get_rgb2xyz_matrix(pl_raw_primaries_get(csp->primaries));
+
+    // FIXME: Cannot use `const vec3` due to glslang bug #2025
+    const ident_t name = sh_fresh(sh, "luma_coeffs");
+
+    // Only the Y row of the matrix is needed (RGB->Y vector)
+    GLSLH("#define "$" vec3("$", "$", "$") \n", name,
+          SH_FLOAT(to_xyz.m[1][0]),
+          SH_FLOAT(to_xyz.m[1][1]),
+          SH_FLOAT(to_xyz.m[1][2]));
+
+    return name;
+}
+
+// Appends GLSL that decodes `color.rgb` from the transfer function selected
+// by `csp->transfer` into linear light. Emits nothing when `sh_require`
+// fails or when the transfer is already PL_COLOR_TRC_LINEAR.
+//
+// `csp_min` / `csp_max` are the nominal luma bounds reported by
+// `pl_color_space_nominal_luma_ex` in PL_HDR_NORM scaling. Cases that jump to
+// `scale_out` get their result remapped as
+// `(csp_max - csp_min) * x + csp_min`; every other case returns right after
+// emitting its own decoding code.
+void pl_shader_linearize(pl_shader sh, const struct pl_color_space *csp)
+{
+ if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+ return;
+
+ if (csp->transfer == PL_COLOR_TRC_LINEAR)
+ return;
+
+ float csp_min, csp_max;
+ pl_color_space_nominal_luma_ex(pl_nominal_luma_params(
+ .color = csp,
+ .metadata = PL_HDR_METADATA_HDR10,
+ .scaling = PL_HDR_NORM,
+ .out_min = &csp_min,
+ .out_max = &csp_max,
+ ));
+
+ // Note that this clamp may technically violate the definition of
+ // ITU-R BT.2100, which allows for sub-blacks and super-whites to be
+ // displayed on the display where such would be possible. That said, the
+ // problem is that not all gamma curves are well-defined on the values
+ // outside this range, so we ignore it and just clamp anyway for sanity.
+ GLSL("// pl_shader_linearize \n"
+ "color.rgb = max(color.rgb, 0.0); \n");
+
+ switch (csp->transfer) {
+ case PL_COLOR_TRC_SRGB:
+ GLSL("color.rgb = mix(color.rgb * vec3(1.0/12.92), \n"
+ " pow((color.rgb + vec3(0.055))/vec3(1.055), \n"
+ " vec3(2.4)), \n"
+ " lessThan(vec3(0.04045), color.rgb)); \n");
+ goto scale_out;
+ case PL_COLOR_TRC_BT_1886: {
+ // `lb` / `lw` are the luma bounds raised to 1/2.4; `a` is the gain and
+ // `b` the input offset of the emitted `a * pow(x + b, 2.4)`
+ const float lb = powf(csp_min, 1/2.4f);
+ const float lw = powf(csp_max, 1/2.4f);
+ const float a = powf(lw - lb, 2.4f);
+ const float b = lb / (lw - lb);
+ GLSL("color.rgb = "$" * pow(color.rgb + vec3("$"), vec3(2.4)); \n",
+ SH_FLOAT(a), SH_FLOAT(b));
+ return;
+ }
+ case PL_COLOR_TRC_GAMMA18:
+ GLSL("color.rgb = pow(color.rgb, vec3(1.8));\n");
+ goto scale_out;
+ case PL_COLOR_TRC_GAMMA20:
+ GLSL("color.rgb = pow(color.rgb, vec3(2.0));\n");
+ goto scale_out;
+ case PL_COLOR_TRC_UNKNOWN: // unknown transfer is decoded as gamma 2.2
+ case PL_COLOR_TRC_GAMMA22:
+ GLSL("color.rgb = pow(color.rgb, vec3(2.2));\n");
+ goto scale_out;
+ case PL_COLOR_TRC_GAMMA24:
+ GLSL("color.rgb = pow(color.rgb, vec3(2.4));\n");
+ goto scale_out;
+ case PL_COLOR_TRC_GAMMA26:
+ GLSL("color.rgb = pow(color.rgb, vec3(2.6));\n");
+ goto scale_out;
+ case PL_COLOR_TRC_GAMMA28:
+ GLSL("color.rgb = pow(color.rgb, vec3(2.8));\n");
+ goto scale_out;
+ case PL_COLOR_TRC_PRO_PHOTO:
+ GLSL("color.rgb = mix(color.rgb * vec3(1.0/16.0), \n"
+ " pow(color.rgb, vec3(1.8)), \n"
+ " lessThan(vec3(0.03125), color.rgb)); \n");
+ goto scale_out;
+ case PL_COLOR_TRC_ST428:
+ GLSL("color.rgb = vec3(52.37/48.0) * pow(color.rgb, vec3(2.6));\n");
+ goto scale_out;
+ case PL_COLOR_TRC_PQ:
+ GLSL("color.rgb = pow(color.rgb, vec3(1.0/%f)); \n"
+ "color.rgb = max(color.rgb - vec3(%f), 0.0) \n"
+ " / (vec3(%f) - vec3(%f) * color.rgb); \n"
+ "color.rgb = pow(color.rgb, vec3(1.0/%f)); \n"
+ // PQ's output range is 0-10000, but we need it to be relative to
+ // PL_COLOR_SDR_WHITE instead, so rescale
+ "color.rgb *= vec3(%f); \n",
+ PQ_M2, PQ_C1, PQ_C2, PQ_C3, PQ_M1, 10000.0 / PL_COLOR_SDR_WHITE);
+ return;
+ case PL_COLOR_TRC_HLG: {
+ // `y` is clamped to at least 1 and supplies the OOTF exponent (y - 1);
+ // `b` is blended into the signal as `(1 - b) * x + b` before the
+ // inverse OETF
+ const float y = fmaxf(1.2f + 0.42f * log10f(csp_max / HLG_REF), 1);
+ const float b = sqrtf(3 * powf(csp_min / csp_max, 1 / y));
+ // OETF^-1
+ GLSL("color.rgb = "$" * color.rgb + vec3("$"); \n"
+ "color.rgb = mix(vec3(4.0) * color.rgb * color.rgb, \n"
+ " exp((color.rgb - vec3(%f)) * vec3(1.0/%f))\n"
+ " + vec3(%f), \n"
+ " lessThan(vec3(0.5), color.rgb)); \n",
+ SH_FLOAT(1 - b), SH_FLOAT(b),
+ HLG_C, HLG_A, HLG_B);
+ // OOTF
+ GLSL("color.rgb *= 1.0 / 12.0; \n"
+ "color.rgb *= "$" * pow(max(dot("$", color.rgb), 0.0), "$"); \n",
+ SH_FLOAT(csp_max), sh_luma_coeffs(sh, csp), SH_FLOAT(y - 1));
+ return;
+ }
+ case PL_COLOR_TRC_V_LOG:
+ GLSL("color.rgb = mix((color.rgb - vec3(0.125)) * vec3(1.0/5.6), \n"
+ " pow(vec3(10.0), (color.rgb - vec3(%f)) * vec3(1.0/%f)) \n"
+ " - vec3(%f), \n"
+ " lessThanEqual(vec3(0.181), color.rgb)); \n",
+ VLOG_D, VLOG_C, VLOG_B);
+ return;
+ case PL_COLOR_TRC_S_LOG1:
+ GLSL("color.rgb = pow(vec3(10.0), (color.rgb - vec3(%f)) * vec3(1.0/%f)) \n"
+ " - vec3(%f); \n",
+ SLOG_C, SLOG_A, SLOG_B);
+ return;
+ case PL_COLOR_TRC_S_LOG2:
+ GLSL("color.rgb = mix((color.rgb - vec3(%f)) * vec3(1.0/%f), \n"
+ " (pow(vec3(10.0), (color.rgb - vec3(%f)) * vec3(1.0/%f)) \n"
+ " - vec3(%f)) * vec3(1.0/%f), \n"
+ " lessThanEqual(vec3(%f), color.rgb)); \n",
+ SLOG_Q, SLOG_P, SLOG_C, SLOG_A, SLOG_B, SLOG_K2, SLOG_Q);
+ return;
+ case PL_COLOR_TRC_LINEAR:
+ case PL_COLOR_TRC_COUNT:
+ break;
+ }
+
+ // Only reached via the LINEAR/COUNT cases; LINEAR already returned early
+ pl_unreachable();
+
+scale_out:
+ // Remap the decoded value onto [csp_min, csp_max]; skipped when that
+ // mapping is the identity
+ if (csp_max != 1 || csp_min != 0) {
+ GLSL("color.rgb = "$" * color.rgb + vec3("$"); \n",
+ SH_FLOAT(csp_max - csp_min), SH_FLOAT(csp_min));
+ }
+}
+
+// Appends GLSL that encodes linear-light `color.rgb` with the transfer
+// function selected by `csp->transfer`. Emits nothing when `sh_require`
+// fails or when the transfer is PL_COLOR_TRC_LINEAR.
+//
+// Works in three steps: (1) for the transfers listed in the first switch,
+// map [csp_min, csp_max] back onto [0, 1]; (2) clamp negative values to
+// zero; (3) emit the per-transfer encoding in the second switch.
+void pl_shader_delinearize(pl_shader sh, const struct pl_color_space *csp)
+{
+ if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+ return;
+
+ if (csp->transfer == PL_COLOR_TRC_LINEAR)
+ return;
+
+ float csp_min, csp_max;
+ pl_color_space_nominal_luma_ex(pl_nominal_luma_params(
+ .color = csp,
+ .metadata = PL_HDR_METADATA_HDR10,
+ .scaling = PL_HDR_NORM,
+ .out_min = &csp_min,
+ .out_max = &csp_max,
+ ));
+
+ GLSL("// pl_shader_delinearize \n");
+ switch (csp->transfer) {
+ case PL_COLOR_TRC_UNKNOWN:
+ case PL_COLOR_TRC_SRGB:
+ case PL_COLOR_TRC_LINEAR:
+ case PL_COLOR_TRC_GAMMA18:
+ case PL_COLOR_TRC_GAMMA20:
+ case PL_COLOR_TRC_GAMMA22:
+ case PL_COLOR_TRC_GAMMA24:
+ case PL_COLOR_TRC_GAMMA26:
+ case PL_COLOR_TRC_GAMMA28:
+ case PL_COLOR_TRC_PRO_PHOTO:
+ case PL_COLOR_TRC_ST428: ;
+ // Emits `(x - csp_min) / (csp_max - csp_min)`, written as a
+ // multiply-add; skipped when that mapping is the identity
+ if (csp_max != 1 || csp_min != 0) {
+ GLSL("color.rgb = "$" * color.rgb + vec3("$"); \n",
+ SH_FLOAT(1 / (csp_max - csp_min)),
+ SH_FLOAT(-csp_min / (csp_max - csp_min)));
+ }
+ break;
+ case PL_COLOR_TRC_BT_1886:
+ case PL_COLOR_TRC_PQ:
+ case PL_COLOR_TRC_HLG:
+ case PL_COLOR_TRC_V_LOG:
+ case PL_COLOR_TRC_S_LOG1:
+ case PL_COLOR_TRC_S_LOG2:
+ break; // scene-referred or absolute scale
+ case PL_COLOR_TRC_COUNT:
+ pl_unreachable();
+ }
+
+ GLSL("color.rgb = max(color.rgb, 0.0); \n");
+
+ switch (csp->transfer) {
+ case PL_COLOR_TRC_SRGB:
+ GLSL("color.rgb = mix(color.rgb * vec3(12.92), \n"
+ " vec3(1.055) * pow(color.rgb, vec3(1.0/2.4)) \n"
+ " - vec3(0.055), \n"
+ " lessThanEqual(vec3(0.0031308), color.rgb)); \n");
+ return;
+ case PL_COLOR_TRC_BT_1886: {
+ // Same `a` (gain) and `b` (offset) as computed for this transfer in
+ // pl_shader_linearize; the emitted code applies them in reverse
+ const float lb = powf(csp_min, 1/2.4f);
+ const float lw = powf(csp_max, 1/2.4f);
+ const float a = powf(lw - lb, 2.4f);
+ const float b = lb / (lw - lb);
+ GLSL("color.rgb = pow("$" * color.rgb, vec3(1.0/2.4)) - vec3("$"); \n",
+ SH_FLOAT(1.0 / a), SH_FLOAT(b));
+ return;
+ }
+ case PL_COLOR_TRC_GAMMA18:
+ GLSL("color.rgb = pow(color.rgb, vec3(1.0/1.8));\n");
+ return;
+ case PL_COLOR_TRC_GAMMA20:
+ GLSL("color.rgb = pow(color.rgb, vec3(1.0/2.0));\n");
+ return;
+ case PL_COLOR_TRC_UNKNOWN: // unknown transfer is encoded as gamma 2.2
+ case PL_COLOR_TRC_GAMMA22:
+ GLSL("color.rgb = pow(color.rgb, vec3(1.0/2.2));\n");
+ return;
+ case PL_COLOR_TRC_GAMMA24:
+ GLSL("color.rgb = pow(color.rgb, vec3(1.0/2.4));\n");
+ return;
+ case PL_COLOR_TRC_GAMMA26:
+ GLSL("color.rgb = pow(color.rgb, vec3(1.0/2.6));\n");
+ return;
+ case PL_COLOR_TRC_GAMMA28:
+ GLSL("color.rgb = pow(color.rgb, vec3(1.0/2.8));\n");
+ return;
+ case PL_COLOR_TRC_ST428:
+ GLSL("color.rgb = pow(color.rgb * vec3(48.0/52.37), vec3(1.0/2.6));\n");
+ return;
+ case PL_COLOR_TRC_PRO_PHOTO:
+ GLSL("color.rgb = mix(color.rgb * vec3(16.0), \n"
+ " pow(color.rgb, vec3(1.0/1.8)), \n"
+ " lessThanEqual(vec3(0.001953), color.rgb)); \n");
+ return;
+ case PL_COLOR_TRC_PQ:
+ // The first emitted line rescales from PL_COLOR_SDR_WHITE-relative
+ // values to a fraction of 10000 before applying the PQ curve
+ GLSL("color.rgb *= vec3(1.0/%f); \n"
+ "color.rgb = pow(color.rgb, vec3(%f)); \n"
+ "color.rgb = (vec3(%f) + vec3(%f) * color.rgb) \n"
+ " / (vec3(1.0) + vec3(%f) * color.rgb); \n"
+ "color.rgb = pow(color.rgb, vec3(%f)); \n",
+ 10000 / PL_COLOR_SDR_WHITE, PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2);
+ return;
+ case PL_COLOR_TRC_HLG: {
+ // `y` and `b` are derived exactly as in pl_shader_linearize; here the
+ // OOTF exponent is inverted as (1 - y) / y and `b` is removed again
+ // after the OETF
+ const float y = fmaxf(1.2f + 0.42f * log10f(csp_max / HLG_REF), 1);
+ const float b = sqrtf(3 * powf(csp_min / csp_max, 1 / y));
+ // OOTF^-1
+ GLSL("color.rgb *= 1.0 / "$"; \n"
+ "color.rgb *= 12.0 * max(1e-6, pow(dot("$", color.rgb), "$")); \n",
+ SH_FLOAT(csp_max), sh_luma_coeffs(sh, csp), SH_FLOAT((1 - y) / y));
+ // OETF
+ GLSL("color.rgb = mix(vec3(0.5) * sqrt(color.rgb), \n"
+ " vec3(%f) * log(color.rgb - vec3(%f)) + vec3(%f), \n"
+ " lessThan(vec3(1.0), color.rgb)); \n"
+ "color.rgb = "$" * color.rgb + vec3("$"); \n",
+ HLG_A, HLG_B, HLG_C,
+ SH_FLOAT(1 / (1 - b)), SH_FLOAT(-b / (1 - b)));
+ return;
+ }
+ case PL_COLOR_TRC_V_LOG:
+ // Dividing by M_LN10 turns the GLSL natural `log` into a base-10 log
+ GLSL("color.rgb = mix(vec3(5.6) * color.rgb + vec3(0.125), \n"
+ " vec3(%f) * log(color.rgb + vec3(%f)) \n"
+ " + vec3(%f), \n"
+ " lessThanEqual(vec3(0.01), color.rgb)); \n",
+ VLOG_C / M_LN10, VLOG_B, VLOG_D);
+ return;
+ case PL_COLOR_TRC_S_LOG1:
+ GLSL("color.rgb = vec3(%f) * log(color.rgb + vec3(%f)) + vec3(%f);\n",
+ SLOG_A / M_LN10, SLOG_B, SLOG_C);
+ return;
+ case PL_COLOR_TRC_S_LOG2:
+ GLSL("color.rgb = mix(vec3(%f) * color.rgb + vec3(%f), \n"
+ " vec3(%f) * log(vec3(%f) * color.rgb + vec3(%f)) \n"
+ " + vec3(%f), \n"
+ " lessThanEqual(vec3(0.0), color.rgb)); \n",
+ SLOG_P, SLOG_Q, SLOG_A / M_LN10, SLOG_K2, SLOG_B, SLOG_C);
+ return;
+ case PL_COLOR_TRC_LINEAR:
+ case PL_COLOR_TRC_COUNT:
+ break;
+ }
+
+ // Only reached via the LINEAR/COUNT cases; LINEAR already returned early
+ pl_unreachable();
+}
+
+// Default sigmoid parameters, initialized from PL_SIGMOID_DEFAULTS. Used by
+// pl_shader_sigmoidize / pl_shader_unsigmoidize when `params` is NULL or when
+// an individual field is zero.
+const struct pl_sigmoid_params pl_sigmoid_default_params = { PL_SIGMOID_DEFAULTS };
+
+// Appends GLSL that clamps `color` to [0, 1] and then applies the inverse of
+// a logistic curve to it. Missing (NULL / zero) parameters fall back to
+// `pl_sigmoid_default_params`. Does nothing if `sh_require` fails.
+void pl_shader_sigmoidize(pl_shader sh, const struct pl_sigmoid_params *params)
+{
+    if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+        return;
+
+    const struct pl_sigmoid_params *cfg =
+        PL_DEF(params, &pl_sigmoid_default_params);
+    const float mid   = PL_DEF(cfg->center, pl_sigmoid_default_params.center);
+    const float steep = PL_DEF(cfg->slope, pl_sigmoid_default_params.slope);
+
+    // The curve has to pass through (0,0) and (1,1). Evaluate the logistic
+    // function at both ends: its value at 0 becomes the shift, and the
+    // difference between its values at 1 and 0 becomes the stretch.
+    const float at_zero = 1.0 / (1 + expf(steep * mid));
+    const float span = 1.0 / (1 + expf(steep * (mid - 1))) - at_zero;
+
+    GLSL("// pl_shader_sigmoidize \n"
+         "color = clamp(color, 0.0, 1.0); \n"
+         "color = vec4("$") - vec4("$") * \n"
+         " log(vec4(1.0) / (color * vec4("$") + vec4("$")) \n"
+         " - vec4(1.0)); \n",
+         SH_FLOAT(mid), SH_FLOAT(1.0 / steep),
+         SH_FLOAT(span), SH_FLOAT(at_zero));
+}
+
+// Appends GLSL that clamps `color` to [0, 1] and then applies the logistic
+// curve, normalized by the same shift/stretch that pl_shader_sigmoidize
+// derives from `center` and `slope`. Missing (NULL / zero) parameters fall
+// back to `pl_sigmoid_default_params`. Does nothing if `sh_require` fails.
+void pl_shader_unsigmoidize(pl_shader sh, const struct pl_sigmoid_params *params)
+{
+    if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+        return;
+
+    const struct pl_sigmoid_params *cfg =
+        PL_DEF(params, &pl_sigmoid_default_params);
+    const float mid   = PL_DEF(cfg->center, pl_sigmoid_default_params.center);
+    const float steep = PL_DEF(cfg->slope, pl_sigmoid_default_params.slope);
+
+    // Logistic value at 0 (shift) and the distance to its value at 1 (stretch)
+    const float at_zero = 1.0 / (1 + expf(steep * mid));
+    const float span = 1.0 / (1 + expf(steep * (mid - 1))) - at_zero;
+
+    GLSL("// pl_shader_unsigmoidize \n"
+         "color = clamp(color, 0.0, 1.0); \n"
+         "color = vec4("$") / \n"
+         " (vec4(1.0) + exp(vec4("$") * (vec4("$") - color))) \n"
+         " - vec4("$"); \n",
+         SH_FLOAT(1.0 / span),
+         SH_FLOAT(steep), SH_FLOAT(mid),
+         SH_FLOAT(at_zero / span));
+}
+
+// Preset peak detection parameters, initialized from PL_PEAK_DETECT_DEFAULTS
+// and PL_PEAK_DETECT_HQ_DEFAULTS respectively. The former is what
+// pl_shader_detect_peak uses when called with `params == NULL`.
+const struct pl_peak_detect_params pl_peak_detect_default_params = { PL_PEAK_DETECT_DEFAULTS };
+const struct pl_peak_detect_params pl_peak_detect_high_quality_params = { PL_PEAK_DETECT_HQ_DEFAULTS };
+
+// Returns whether `a` and `b` agree on every parameter that influences the
+// measurement itself. `allow_delayed` is deliberately left out of the
+// comparison because it doesn't change measurement.
+static bool peak_detect_params_eq(const struct pl_peak_detect_params *a,
+                                  const struct pl_peak_detect_params *b)
+{
+    if (a->smoothing_period != b->smoothing_period)
+        return false;
+    if (a->scene_threshold_low != b->scene_threshold_low)
+        return false;
+    if (a->scene_threshold_high != b->scene_threshold_high)
+        return false;
+
+    return a->percentile == b->percentile;
+}
+
+// Compile-time layout constants shared by the peak detection GLSL and the
+// CPU-side readback code (`struct peak_buf_data`, `measure_peak`).
+enum {
+ // Split the peak buffer into several independent slices to reduce pressure
+ // on global atomics
+ SLICES = 12,
+
+ // How many bits to use for storing PQ data. Be careful when setting this
+ // too high, as it may overflow `unsigned int` on large video sources.
+ //
+ // The value chosen is enough to guarantee no overflow for an 8K x 4K frame
+ // consisting entirely of 100% 10k nits PQ values, with 16x16 workgroups.
+ PQ_BITS = 14,
+ PQ_MAX = (1 << PQ_BITS) - 1, // largest value representable in PQ_BITS bits
+
+ // How many bits to use for the histogram. We bias the histogram down
+ // by half the PQ range (~90 nits), effectively clumping the SDR part
+ // of the image into a single histogram bin.
+ HIST_BITS = 7,
+ HIST_BIAS = 1 << (HIST_BITS - 1), // half of the 2^HIST_BITS bin range
+ HIST_BINS = (1 << HIST_BITS) - HIST_BIAS, // bins left above the bias
+
+ // Convert from histogram bin to (starting) PQ value
+#define HIST_PQ(bin) (((bin) + HIST_BIAS) << (PQ_BITS - HIST_BITS))
+};
+
+
+// HIST_PQ shifts left by (PQ_BITS - HIST_BITS), which must not be negative
+pl_static_assert(PQ_BITS >= HIST_BITS);
+
+// CPU-side mirror of the "PeakBuf" storage buffer. The peak detection shader
+// accumulates into these per-slice counters with atomics, and
+// `update_peak_buf` reads the whole struct back. The field layout is
+// described to the shader by `peak_buf_vars`.
+struct peak_buf_data {
+ unsigned frame_wg_count[SLICES]; // number of work groups processed
+ unsigned frame_wg_active[SLICES];// number of active (nonzero) work groups
+ unsigned frame_sum_pq[SLICES]; // sum of PQ Y values over all WGs (PQ_BITS)
+ unsigned frame_max_pq[SLICES]; // maximum PQ Y value among these WGs (PQ_BITS)
+ unsigned frame_hist[SLICES][HIST_BINS]; // always allocated, conditionally used
+};
+
+// Buffer variable descriptions for every field of `struct peak_buf_data`.
+// The VAR() helper declares each field as a flat array of PL_VAR_UINT whose
+// length is the field size divided by sizeof(unsigned), placed at the
+// field's offset within the struct with a stride of one unsigned. Note that
+// the 2D `frame_hist` field is therefore exposed as a 1D array, which is why
+// the shader indexes it as `slice * HIST_BINS + i`.
+static const struct pl_buffer_var peak_buf_vars[] = {
+#define VAR(field) { \
+ .var = { \
+ .name = #field, \
+ .type = PL_VAR_UINT, \
+ .dim_v = 1, \
+ .dim_m = 1, \
+ .dim_a = sizeof(((struct peak_buf_data *) NULL)->field) / \
+ sizeof(unsigned), \
+ }, \
+ .layout = { \
+ .offset = offsetof(struct peak_buf_data, field), \
+ .size = sizeof(((struct peak_buf_data *) NULL)->field), \
+ .stride = sizeof(unsigned), \
+ }, \
+}
+ VAR(frame_wg_count),
+ VAR(frame_wg_active),
+ VAR(frame_sum_pq),
+ VAR(frame_max_pq),
+ VAR(frame_hist),
+#undef VAR
+};
+
+// Private state stored in a PL_SHADER_OBJ_COLOR_MAP shader object. Released
+// by `sh_color_map_uninit`.
+struct sh_color_map_obj {
+ // Tone map state
+ struct {
+ struct pl_tone_map_params params;
+ pl_shader_obj lut;
+ } tone;
+
+ // Gamut map state
+ struct {
+ pl_shader_obj lut;
+ } gamut;
+
+ // Peak detection state
+ struct {
+ struct pl_peak_detect_params params; // currently active parameters
+ pl_buf buf; // pending peak detection buffer
+ pl_buf readback; // readback buffer (fallback)
+ // A value of exactly 0 in `avg_pq` is treated as "no data yet" by
+ // update_peak_buf and pl_get_detected_hdr_metadata
+ float avg_pq; // current (smoothed) values
+ float max_pq;
+ } peak;
+};
+
+// Computes a hash over the gamut mapping parameters: function name, input and
+// output gamut, luma range and constants, seeded with CACHE_KEY_GAMUT_LUT.
+// The LUT size is intentionally excluded, since this is checked by sh_lut.
+static uint64_t gamut_map_signature(const struct pl_gamut_map_params *par)
+{
+    uint64_t hash = CACHE_KEY_GAMUT_LUT;
+
+    // Mapping function identity
+    pl_hash_merge(&hash, pl_str0_hash(par->function->name));
+
+    // Source and destination gamuts
+    pl_hash_merge(&hash, pl_var_hash(par->input_gamut));
+    pl_hash_merge(&hash, pl_var_hash(par->output_gamut));
+
+    // Luma range
+    pl_hash_merge(&hash, pl_var_hash(par->min_luma));
+    pl_hash_merge(&hash, pl_var_hash(par->max_luma));
+
+    // Tunable constants
+    pl_hash_merge(&hash, pl_var_hash(par->constants));
+
+    return hash;
+}
+
+// Destructor for `struct sh_color_map_obj`: destroys both LUT objects and
+// both peak detection buffers, then zeroes the whole state.
+static void sh_color_map_uninit(pl_gpu gpu, void *ptr)
+{
+    struct sh_color_map_obj *map = ptr;
+
+    // Nested shader objects
+    pl_shader_obj_destroy(&map->tone.lut);
+    pl_shader_obj_destroy(&map->gamut.lut);
+
+    // GPU buffers
+    pl_buf_destroy(gpu, &map->peak.buf);
+    pl_buf_destroy(gpu, &map->peak.readback);
+
+    memset(map, 0, sizeof(*map));
+}
+
+// Converts the smoothing period `rate` into the blend coefficient of the
+// first-order IIR low-pass used in update_peak_buf, i.e. the `coeff` in
+// `state += coeff * (sample - state)`.
+//
+// Returns 1.0 (no smoothing, the new sample replaces the state) when `rate`
+// is zero, negative or NaN. Previously only an exact zero was handled: a
+// negative period yielded a negative coefficient and NaN propagated, both of
+// which make the filter diverge instead of converging on the samples.
+// For any positive `rate` the result is 1 - exp(-1 / rate), as before.
+static inline float iir_coeff(float rate)
+{
+    // Written as a negated comparison so that NaN also takes this branch
+    if (!(rate > 0.0f))
+        return 1.0f;
+    return 1.0f - expf(-1.0f / rate);
+}
+
+// Estimates the peak brightness of a frame from the read-back peak buffer,
+// as a PQ value normalized by PQ_MAX.
+//
+// Returns the maximum of `frame_max_pq` over all slices when `percentile` is
+// outside the open interval (0, 100), when the histogram is empty, or when
+// the requested percentile rounds up to the last pixel. Otherwise walks the
+// histogram (summed over slices) until the bin containing the target pixel
+// rank is found, and linearly interpolates inside that bin.
+static float measure_peak(const struct peak_buf_data *data, float percentile)
+{
+ unsigned frame_max_pq = data->frame_max_pq[0];
+ for (int k = 1; k < SLICES; k++)
+ frame_max_pq = PL_MAX(frame_max_pq, data->frame_max_pq[k]);
+ const float frame_max = (float) frame_max_pq / PQ_MAX;
+ if (percentile <= 0 || percentile >= 100)
+ return frame_max;
+ unsigned total_pixels = 0;
+ for (int k = 0; k < SLICES; k++) {
+ for (int i = 0; i < HIST_BINS; i++)
+ total_pixels += data->frame_hist[k][i];
+ }
+ if (!total_pixels) // no histogram data available?
+ return frame_max;
+
+ // 1-based rank of the pixel that sits at the requested percentile
+ const unsigned target_pixel = ceilf(percentile / 100.0f * total_pixels);
+ if (target_pixel >= total_pixels)
+ return frame_max;
+
+ // `sum` is the cumulative pixel count of all bins before bin `i`
+ unsigned sum = 0;
+ for (int i = 0; i < HIST_BINS; i++) {
+ unsigned next = sum;
+ for (int k = 0; k < SLICES; k++)
+ next += data->frame_hist[k][i];
+ if (next < target_pixel) {
+ sum = next;
+ continue;
+ }
+
+ // Upper and lower frequency boundaries of the matching histogram bin
+ const unsigned count_low = sum; // last pixel of previous bin
+ const unsigned count_high = next + 1; // first pixel of next bin
+ pl_assert(count_low < target_pixel && target_pixel < count_high);
+
+ // PQ luminance associated with count_low/high respectively
+ const float pq_low = (float) HIST_PQ(i) / PQ_MAX;
+ float pq_high = (float) HIST_PQ(i + 1) / PQ_MAX;
+ if (count_high > total_pixels) // special case for last histogram bin
+ pq_high = frame_max;
+
+ // Position of `target_pixel` inside this bin, assumes pixels are
+ // equidistributed inside a histogram bin
+ const float ratio = (float) (target_pixel - count_low) /
+ (count_high - count_low);
+ return PL_MIX(pq_low, pq_high, ratio);
+ }
+
+ // target_pixel < total_pixels, so some bin must have matched above
+ pl_unreachable();
+}
+
+// Reads back the pending peak detection buffer (if any) and folds the
+// measured values into the smoothed `obj->peak.avg_pq` / `max_pq` state.
+//
+// If `force` is true, ensures the buffer is read even when
+// `params->allow_delayed` is set and the buffer is still being polled as
+// busy; otherwise such a buffer is left pending for a later call.
+// On a successful read the pending buffer is destroyed. On failure, or when
+// the buffer holds no results, the smoothed state is left untouched.
+static void update_peak_buf(pl_gpu gpu, struct sh_color_map_obj *obj, bool force)
+{
+ const struct pl_peak_detect_params *params = &obj->peak.params;
+ if (!obj->peak.buf)
+ return;
+
+ if (!force && params->allow_delayed && pl_buf_poll(gpu, obj->peak.buf, 0))
+ return; // buffer not ready yet
+
+ bool ok;
+ struct peak_buf_data data = {0};
+ if (obj->peak.readback) {
+ // Fallback path: copy into the separate host-readable buffer first
+ pl_buf_copy(gpu, obj->peak.readback, 0, obj->peak.buf, 0, sizeof(data));
+ ok = pl_buf_read(gpu, obj->peak.readback, 0, &data, sizeof(data));
+ } else {
+ ok = pl_buf_read(gpu, obj->peak.buf, 0, &data, sizeof(data));
+ }
+ if (ok && data.frame_wg_count[0] > 0) {
+ // Peak detection completed successfully
+ pl_buf_destroy(gpu, &obj->peak.buf);
+ } else {
+ // No data read? Possibly this peak obj has not been executed yet
+ if (!ok) {
+ PL_ERR(gpu, "Failed reading peak detection buffer!");
+ } else if (params->allow_delayed) {
+ PL_TRACE(gpu, "Peak detection buffer not yet ready, ignoring..");
+ } else {
+ PL_WARN(gpu, "Peak detection usage error: attempted detecting peak "
+ "and using detected peak in the same shader program, "
+ "but `params->allow_delayed` is false! Ignoring, but "
+ "expect incorrect output.");
+ }
+ // Keep an empty-but-readable buffer around unless the caller forced
+ // the read
+ if (force || !ok)
+ pl_buf_destroy(gpu, &obj->peak.buf);
+ return;
+ }
+
+ // Combine the independent slices into frame totals
+ uint64_t frame_sum_pq = 0u, frame_wg_count = 0u, frame_wg_active = 0u;
+ for (int k = 0; k < SLICES; k++) {
+ frame_sum_pq += data.frame_sum_pq[k];
+ frame_wg_count += data.frame_wg_count[k];
+ frame_wg_active += data.frame_wg_active[k];
+ }
+ float avg_pq, max_pq;
+ if (frame_wg_active) {
+ // Mean over active work groups, normalized by PQ_MAX
+ avg_pq = (float) frame_sum_pq / (frame_wg_active * PQ_MAX);
+ max_pq = measure_peak(&data, params->percentile);
+ } else {
+ // Solid black frame
+ avg_pq = max_pq = PL_COLOR_HDR_BLACK;
+ }
+
+ if (!obj->peak.avg_pq) {
+ // Set the initial value accordingly if it contains no data
+ obj->peak.avg_pq = avg_pq;
+ obj->peak.max_pq = max_pq;
+ } else {
+ // Ignore small deviations from existing peak (rounding error)
+ static const float epsilon = 1.0f / PQ_MAX;
+ if (fabsf(avg_pq - obj->peak.avg_pq) < epsilon)
+ avg_pq = obj->peak.avg_pq;
+ if (fabsf(max_pq - obj->peak.max_pq) < epsilon)
+ max_pq = obj->peak.max_pq;
+ }
+
+ // Use an IIR low-pass filter to smooth out the detected values
+ const float coeff = iir_coeff(params->smoothing_period);
+ obj->peak.avg_pq += coeff * (avg_pq - obj->peak.avg_pq);
+ obj->peak.max_pq += coeff * (max_pq - obj->peak.max_pq);
+
+ // Scene change hysteresis
+ if (params->scene_threshold_low > 0 && params->scene_threshold_high > 0) {
+ const float log10_pq = 1e-2f; // experimentally determined approximate
+ const float thresh_low = params->scene_threshold_low * log10_pq;
+ const float thresh_high = params->scene_threshold_high * log10_pq;
+ // Weight the difference by the fraction of active work groups
+ const float bias = (float) frame_wg_active / frame_wg_count;
+ const float delta = bias * fabsf(avg_pq - obj->peak.avg_pq);
+ // The larger the jump, the more the smoothed state snaps to the new
+ // measurement
+ const float mix_coeff = pl_smoothstep(thresh_low, thresh_high, delta);
+ obj->peak.avg_pq = PL_MIX(obj->peak.avg_pq, avg_pq, mix_coeff);
+ obj->peak.max_pq = PL_MIX(obj->peak.max_pq, max_pq, mix_coeff);
+ }
+}
+
+// Appends a compute shader pass that measures the luminance of `color`
+// (interpreted in `csp`) and accumulates per-frame statistics into a freshly
+// created "PeakBuf" storage buffer owned by `*state`. `color` itself is
+// restored to its original value at the end of the emitted code.
+//
+// `params` defaults to `pl_peak_detect_default_params` when NULL.
+//
+// Returns false when `sh_require` fails, when there is no GPU or its SSBO
+// size limit is too small, when a 16x16 compute shader with the required
+// shared memory is unavailable, or when the state object / buffer cannot be
+// created. Returns true once the detection code has been emitted.
+bool pl_shader_detect_peak(pl_shader sh, struct pl_color_space csp,
+ pl_shader_obj *state,
+ const struct pl_peak_detect_params *params)
+{
+ params = PL_DEF(params, &pl_peak_detect_default_params);
+ if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+ return false;
+
+ pl_gpu gpu = SH_GPU(sh);
+ if (!gpu || gpu->limits.max_ssbo_size < sizeof(struct peak_buf_data)) {
+ PL_ERR(sh, "HDR peak detection requires a GPU with support for at "
+ "least %zu bytes of SSBO data (supported: %zu)",
+ sizeof(struct peak_buf_data), gpu ? gpu->limits.max_ssbo_size : 0);
+ return false;
+ }
+
+ // Shared memory: three uint accumulators, plus the work group histogram
+ // when a percentile strictly between 0 and 100 is requested
+ const bool use_histogram = params->percentile > 0 && params->percentile < 100;
+ size_t shmem_req = 3 * sizeof(uint32_t);
+ if (use_histogram)
+ shmem_req += sizeof(uint32_t[HIST_BINS]);
+
+ if (!sh_try_compute(sh, 16, 16, true, shmem_req)) {
+ PL_ERR(sh, "HDR peak detection requires compute shaders with support "
+ "for at least %zu bytes of shared memory! (avail: %zu)",
+ shmem_req, sh_glsl(sh).max_shmem_size);
+ return false;
+ }
+
+ struct sh_color_map_obj *obj;
+ obj = SH_OBJ(sh, state, PL_SHADER_OBJ_COLOR_MAP, struct sh_color_map_obj,
+ sh_color_map_uninit);
+ if (!obj)
+ return false;
+
+ // Same measurement parameters: consume the pending buffer. Otherwise the
+ // accumulated state is discarded.
+ if (peak_detect_params_eq(&obj->peak.params, params)) {
+ update_peak_buf(gpu, obj, true); // prevent over-writing previous frame
+ } else {
+ pl_reset_detected_peak(*state);
+ }
+
+ pl_assert(!obj->peak.buf);
+ static const struct peak_buf_data zero = {0};
+
+retry_ssbo:
+ // With a fallback readback buffer present, the SSBO itself is created
+ // without host-readable access
+ if (obj->peak.readback) {
+ obj->peak.buf = pl_buf_create(gpu, pl_buf_params(
+ .size = sizeof(struct peak_buf_data),
+ .storable = true,
+ .initial_data = &zero,
+ ));
+ } else {
+ obj->peak.buf = pl_buf_create(gpu, pl_buf_params(
+ .size = sizeof(struct peak_buf_data),
+ .memory_type = PL_BUF_MEM_DEVICE,
+ .host_readable = true,
+ .storable = true,
+ .initial_data = &zero,
+ ));
+ }
+
+ if (!obj->peak.buf && !obj->peak.readback) {
+ PL_WARN(sh, "Failed creating host-readable peak detection SSBO, "
+ "retrying with fallback buffer");
+ obj->peak.readback = pl_buf_create(gpu, pl_buf_params(
+ .size = sizeof(struct peak_buf_data),
+ .host_readable = true,
+ ));
+ if (obj->peak.readback)
+ goto retry_ssbo;
+ }
+
+ if (!obj->peak.buf) {
+ SH_FAIL(sh, "Failed creating peak detection SSBO!");
+ return false;
+ }
+
+ obj->peak.params = *params;
+
+ sh_desc(sh, (struct pl_shader_desc) {
+ .desc = {
+ .name = "PeakBuf",
+ .type = PL_DESC_BUF_STORAGE,
+ .access = PL_DESC_ACCESS_READWRITE,
+ },
+ .binding.object = obj->peak.buf,
+ .buffer_vars = (struct pl_buffer_var *) peak_buf_vars,
+ .num_buffer_vars = PL_ARRAY_SIZE(peak_buf_vars),
+ });
+
+ sh_describe(sh, "peak detection");
+ // Each work group is assigned to one of the SLICES buffer slices, based
+ // on its linear index
+ GLSL("// pl_shader_detect_peak \n"
+ "{ \n"
+ "const uint wg_size = gl_WorkGroupSize.x * gl_WorkGroupSize.y; \n"
+ "uint wg_idx = gl_WorkGroupID.y * gl_NumWorkGroups.x + \n"
+ " gl_WorkGroupID.x; \n"
+ "uint slice = wg_idx %% %du; \n"
+ "vec4 color_orig = color; \n",
+ SLICES);
+
+ // For performance, we want to do as few atomic operations on global
+ // memory as possible, so use an atomic in shmem for the work group.
+ ident_t wg_sum = sh_fresh(sh, "wg_sum"),
+ wg_max = sh_fresh(sh, "wg_max"),
+ wg_black = sh_fresh(sh, "wg_black"),
+ wg_hist = NULL_IDENT;
+ GLSLH("shared uint "$", "$", "$"; \n", wg_sum, wg_max, wg_black);
+ if (use_histogram) {
+ wg_hist = sh_fresh(sh, "wg_hist");
+ GLSLH("shared uint "$"[%u]; \n", wg_hist, HIST_BINS);
+ GLSL("for (uint i = gl_LocalInvocationIndex; i < %du; i += wg_size) \n"
+ " "$"[i] = 0u; \n",
+ HIST_BINS, wg_hist);
+ }
+ GLSL($" = 0u; "$" = 0u; "$" = 0u; \n"
+ "barrier(); \n",
+ wg_sum, wg_max, wg_black);
+
+ // Decode color into linear light representation
+ pl_color_space_infer(&csp);
+ pl_shader_linearize(sh, &csp);
+
+ // Measure luminance as N-bit PQ
+ GLSL("float luma = dot("$", color.rgb); \n"
+ "luma *= %f; \n"
+ "luma = pow(clamp(luma, 0.0, 1.0), %f); \n"
+ "luma = (%f + %f * luma) / (1.0 + %f * luma); \n"
+ "luma = pow(luma, %f); \n"
+ "luma *= smoothstep(0.0, 1e-2, luma); \n"
+ "uint y_pq = uint(%d.0 * luma); \n",
+ sh_luma_coeffs(sh, &csp),
+ PL_COLOR_SDR_WHITE / 10000.0,
+ PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2,
+ PQ_MAX);
+
+ // Update the work group's shared atomics
+ bool has_subgroups = sh_glsl(sh).subgroup_size > 0;
+ if (use_histogram) {
+ // Drop the low bits and subtract the bias; everything below the bias
+ // lands in bin 0 due to the clamp
+ GLSL("int bin = (int(y_pq) >> %d) - %d; \n"
+ "bin = clamp(bin, 0, %d); \n",
+ PQ_BITS - HIST_BITS, HIST_BIAS,
+ HIST_BINS - 1);
+ if (has_subgroups) {
+ // Optimize for the very common case of identical histogram bins
+ GLSL("if (subgroupAllEqual(bin)) { \n"
+ " if (subgroupElect()) \n"
+ " atomicAdd("$"[bin], gl_SubgroupSize); \n"
+ "} else { \n"
+ " atomicAdd("$"[bin], 1u); \n"
+ "} \n",
+ wg_hist, wg_hist);
+ } else {
+ GLSL("atomicAdd("$"[bin], 1u); \n", wg_hist);
+ }
+ }
+
+ if (has_subgroups) {
+ // Reduce within the subgroup first, then let one invocation per
+ // subgroup touch the shared atomics
+ GLSL("uint group_sum = subgroupAdd(y_pq); \n"
+ "uint group_max = subgroupMax(y_pq); \n"
+ "uvec4 b = subgroupBallot(y_pq == 0u); \n"
+ "if (subgroupElect()) { \n"
+ " atomicAdd("$", group_sum); \n"
+ " atomicMax("$", group_max); \n"
+ " atomicAdd("$", subgroupBallotBitCount(b));\n"
+ "} \n"
+ "barrier(); \n",
+ wg_sum, wg_max, wg_black);
+ } else {
+ GLSL("atomicAdd("$", y_pq); \n"
+ "atomicMax("$", y_pq); \n"
+ "if (y_pq == 0u) \n"
+ " atomicAdd("$", 1u); \n"
+ "barrier(); \n",
+ wg_sum, wg_max, wg_black);
+ }
+
+ if (use_histogram) {
+ // Black pixels (y_pq == 0) were counted into bin 0 above; remove them
+ // again before flushing the work group histogram to the global one
+ GLSL("if (gl_LocalInvocationIndex == 0u) \n"
+ " "$"[0] -= "$"; \n"
+ "for (uint i = gl_LocalInvocationIndex; i < %du; i += wg_size) \n"
+ " atomicAdd(frame_hist[slice * %du + i], "$"[i]); \n",
+ wg_hist, wg_black,
+ HIST_BINS,
+ HIST_BINS, wg_hist);
+ }
+
+ // Have one thread per work group update the global atomics
+ // (`num` is the number of non-black pixels in the work group; the value
+ // added to frame_sum_pq is the work group's mean over those pixels)
+ GLSL("if (gl_LocalInvocationIndex == 0u) { \n"
+ " uint num = wg_size - "$"; \n"
+ " atomicAdd(frame_wg_count[slice], 1u); \n"
+ " atomicAdd(frame_wg_active[slice], min(num, 1u)); \n"
+ " if (num > 0u) { \n"
+ " atomicAdd(frame_sum_pq[slice], "$" / num); \n"
+ " atomicMax(frame_max_pq[slice], "$"); \n"
+ " } \n"
+ "} \n"
+ "color = color_orig; \n"
+ "} \n",
+ wg_black, wg_sum, wg_max);
+
+ return true;
+}
+
+// Fetches the most recent detected peak statistics from a color map state
+// object, after giving any pending peak buffer a (non-forced) chance to be
+// read back. Writes `max_pq_y` and `avg_pq_y` of `out` and returns true on
+// success. Returns false, leaving `out` untouched, if `state` is NULL, is
+// not a PL_SHADER_OBJ_COLOR_MAP object, or holds no measurement yet.
+bool pl_get_detected_hdr_metadata(const pl_shader_obj state,
+                                  struct pl_hdr_metadata *out)
+{
+    if (!state)
+        return false;
+    if (state->type != PL_SHADER_OBJ_COLOR_MAP)
+        return false;
+
+    struct sh_color_map_obj *map = state->priv;
+    update_peak_buf(state->gpu, map, false);
+
+    // An average of exactly zero means nothing has been measured so far
+    const bool have_data = map->peak.avg_pq != 0;
+    if (have_data) {
+        out->max_pq_y = map->peak.max_pq;
+        out->avg_pq_y = map->peak.avg_pq;
+    }
+
+    return have_data;
+}
+
+// Variant of pl_get_detected_hdr_metadata that reports the detected peak and
+// average rescaled from PL_HDR_PQ to PL_HDR_NORM. Returns false, without
+// writing to the outputs, when no detected metadata is available.
+bool pl_get_detected_peak(const pl_shader_obj state,
+                          float *out_peak, float *out_avg)
+{
+    struct pl_hdr_metadata detected;
+    const bool ok = pl_get_detected_hdr_metadata(state, &detected);
+
+    if (ok) {
+        // Preserves old behavior
+        *out_peak = pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, detected.max_pq_y);
+        *out_avg = pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, detected.avg_pq_y);
+    }
+
+    return ok;
+}
+
+// Discards all peak detection state of a color map state object: destroys
+// the pending peak buffer and zeroes the parameters and smoothed values.
+// The fallback readback buffer is carried over unchanged. Does nothing if
+// `state` is NULL or is not a PL_SHADER_OBJ_COLOR_MAP object.
+void pl_reset_detected_peak(pl_shader_obj state)
+{
+    if (!state)
+        return;
+    if (state->type != PL_SHADER_OBJ_COLOR_MAP)
+        return;
+
+    struct sh_color_map_obj *map = state->priv;
+
+    // Remember the readback buffer so it survives the wipe below
+    pl_buf kept = map->peak.readback;
+
+    pl_buf_destroy(state->gpu, &map->peak.buf);
+    memset(&map->peak, 0, sizeof(map->peak));
+
+    map->peak.readback = kept;
+}
+
+// Appends GLSL that linearizes `color` according to `csp` and then replaces
+// it with vec4(I, 0, 0, 1), where I is computed by: converting RGB to LMS
+// via `pl_ipt_rgb2lms` for the primaries of `csp`, scaling by
+// PL_COLOR_SDR_WHITE / 10000, applying the PQ curve per channel, and taking
+// the dot product with the first row of `pl_ipt_lms2ipt`.
+// Does nothing if `sh_require` fails.
+// NOTE(review): unlike pl_shader_detect_peak, `csp` is not passed through
+// pl_color_space_infer here - presumably callers pass an inferred color
+// space; confirm.
+void pl_shader_extract_features(pl_shader sh, struct pl_color_space csp)
+{
+ if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+ return;
+
+ sh_describe(sh, "feature extraction");
+ pl_shader_linearize(sh, &csp);
+ GLSL("// pl_shader_extract_features \n"
+ "{ \n"
+ "vec3 lms = %f * "$" * color.rgb; \n"
+ "lms = pow(max(lms, 0.0), vec3(%f)); \n"
+ "lms = (vec3(%f) + %f * lms) \n"
+ " / (vec3(1.0) + %f * lms); \n"
+ "lms = pow(lms, vec3(%f)); \n"
+ "float I = dot(vec3(%f, %f, %f), lms); \n"
+ "color = vec4(I, 0.0, 0.0, 1.0); \n"
+ "} \n",
+ PL_COLOR_SDR_WHITE / 10000,
+ SH_MAT3(pl_ipt_rgb2lms(pl_raw_primaries_get(csp.primaries))),
+ PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2,
+ pl_ipt_lms2ipt.m[0][0], pl_ipt_lms2ipt.m[0][1], pl_ipt_lms2ipt.m[0][2]);
+}
+
+// Preset color mapping parameters, initialized from PL_COLOR_MAP_DEFAULTS and
+// PL_COLOR_MAP_HQ_DEFAULTS respectively.
+const struct pl_color_map_params pl_color_map_default_params = { PL_COLOR_MAP_DEFAULTS };
+const struct pl_color_map_params pl_color_map_high_quality_params = { PL_COLOR_MAP_HQ_DEFAULTS };
+
+// Registers a vec2 attribute named "tone_map_coords" and returns its
+// identifier. The attribute's corner values are the affine map that sends
+// `rc` onto the unit square (with the Y axis flipped), evaluated at 0 and 1
+// on each axis. An axis whose two bounds are both zero is treated as unset
+// and defaults to the range [0, 1].
+static ident_t rect_pos(pl_shader sh, pl_rect2df rc)
+{
+    if (rc.x0 == 0 && rc.x1 == 0)
+        rc.x1 = 1.0f;
+    if (rc.y0 == 0 && rc.y1 == 0)
+        rc.y1 = 1.0f;
+
+    pl_rect2df coords = {
+        // Horizontal: (t - x0) / (x1 - x0) at t = 0 and t = 1
+        .x0 = -rc.x0 / (rc.x1 - rc.x0),
+        .x1 = (1.0f - rc.x0) / (rc.x1 - rc.x0),
+        // Vertical: (t - y1) / (y0 - y1) at t = 0 and t = 1 (flipped)
+        .y0 = -rc.y1 / (rc.y0 - rc.y1),
+        .y1 = (1.0f - rc.y1) / (rc.y0 - rc.y1),
+    };
+
+    return sh_attr_vec2(sh, "tone_map_coords", &coords);
+}
+
+// Appends GLSL that overlays a plot of the tone mapping curve onto
+// `color.rgb` for fragments falling inside the rectangle `rc`, blended with
+// opacity `0.8 * alpha`. The X axis of the plot is the input value and the
+// Y axis the output value; regions are tinted depending on whether they lie
+// inside the source range [input_min, input_max], the target range
+// [output_min, output_max], and below or above the curve.
+//
+// Requires both `params->input_scaling` and `params->output_scaling` to be
+// PL_HDR_PQ. The emitted code calls a GLSL function `tone_map()` that is not
+// defined here - presumably emitted by the caller beforehand; verify.
+static void visualize_tone_map(pl_shader sh, pl_rect2df rc, float alpha,
+ const struct pl_tone_map_params *params)
+{
+ pl_assert(params->input_scaling == PL_HDR_PQ);
+ pl_assert(params->output_scaling == PL_HDR_PQ);
+
+ GLSL("// Visualize tone mapping \n"
+ "{ \n"
+ "vec2 pos = "$"; \n"
+ "if (min(pos.x, pos.y) >= 0.0 && \n" // visualizer rect
+ " max(pos.x, pos.y) <= 1.0) \n"
+ "{ \n"
+ "float xmin = "$"; \n"
+ "float xmax = "$"; \n"
+ "float xavg = "$"; \n"
+ "float ymin = "$"; \n"
+ "float ymax = "$"; \n"
+ "float alpha = 0.8 * "$"; \n"
+ "vec3 viz = color.rgb; \n"
+ "float vv = tone_map(pos.x); \n"
+ // Color based on region
+ "if (pos.x < xmin || pos.x > xmax) { \n" // outside source
+ "} else if (pos.y < ymin || pos.y > ymax) {\n" // outside target
+ " if (pos.y < xmin || pos.y > xmax) { \n" // and also source
+ " viz = vec3(0.1, 0.1, 0.5); \n"
+ " } else { \n"
+ " viz = vec3(0.2, 0.05, 0.05); \n" // but inside source
+ " } \n"
+ "} else { \n" // inside domain
+ " if (abs(pos.x - pos.y) < 1e-3) { \n" // main diagonal
+ " viz = vec3(0.2); \n"
+ " } else if (pos.y < vv) { \n" // inside function
+ " alpha *= 0.6; \n"
+ " viz = vec3(0.05); \n"
+ " if (vv > pos.x && pos.y > pos.x) \n" // output brighter than input
+ " viz.rg = vec2(0.5, 0.7); \n"
+ " } else { \n" // outside function
+ " if (vv < pos.x && pos.y < pos.x) \n" // output darker than input
+ " viz = vec3(0.0, 0.1, 0.2); \n"
+ " } \n"
+ " if (pos.y > xmax) { \n" // inverse tone-mapping region
+ " vec3 hi = vec3(0.2, 0.5, 0.8); \n"
+ " viz = mix(viz, hi, 0.5); \n"
+ " } else if (pos.y < xmin) { \n" // black point region
+ " viz = mix(viz, vec3(0.0), 0.3); \n"
+ " } \n"
+ " if (xavg > 0.0 && abs(pos.x - xavg) < 1e-3)\n" // source avg brightness
+ " viz = vec3(0.5); \n"
+ "} \n"
+ "color.rgb = mix(color.rgb, viz, alpha); \n"
+ "} \n"
+ "} \n",
+ // Arguments below are listed in the order of the "$" placeholders
+ rect_pos(sh, rc),
+ SH_FLOAT_DYN(params->input_min),
+ SH_FLOAT_DYN(params->input_max),
+ SH_FLOAT_DYN(params->input_avg),
+ SH_FLOAT(params->output_min),
+ SH_FLOAT_DYN(params->output_max),
+ SH_FLOAT_DYN(alpha));
+}
+
+// Appends GLSL that, for fragments inside the rectangle `rc`, replaces `ipt`
+// with a visualization of the gamut mapping LUT: a 2D slice through IPT
+// space (oriented by `hue` and `theta`) is run through the LUT function
+// `lut`, shaded depending on whether each point lies inside the source
+// and/or destination gamut, and overlaid with iso-luminance, iso-hue and
+// iso-chroma lines. The unmodified input is kept in `orig` and restored for
+// points that are in neither gamut.
+//
+// The emitted code reads and writes the GLSL variables `ipt`, `lmspq`, `lms`
+// and `idx` without declaring them - presumably they are declared by the
+// surrounding code emitted by the caller; verify.
+static void visualize_gamut_map(pl_shader sh, pl_rect2df rc,
+ ident_t lut, float hue, float theta,
+ const struct pl_gamut_map_params *params)
+{
+ ident_t ipt2lms = SH_MAT3(pl_ipt_ipt2lms);
+ ident_t lms2rgb_src = SH_MAT3(pl_ipt_lms2rgb(&params->input_gamut));
+ ident_t lms2rgb_dst = SH_MAT3(pl_ipt_lms2rgb(&params->output_gamut));
+
+ GLSL("// Visualize gamut mapping \n"
+ "vec2 pos = "$"; \n"
+ "float pqmin = "$"; \n"
+ "float pqmax = "$"; \n"
+ "float rgbmin = "$"; \n"
+ "float rgbmax = "$"; \n"
+ "vec3 orig = ipt; \n"
+ "if (min(pos.x, pos.y) >= 0.0 && \n"
+ " max(pos.x, pos.y) <= 1.0) \n"
+ "{ \n"
+ // Source color to visualize
+ "float mid = mix(pqmin, pqmax, 0.6); \n"
+ "vec3 base = vec3(0.5, 0.0, 0.0); \n"
+ "float hue = "$", theta = "$"; \n"
+ "base.x = mix(base.x, mid, sin(theta)); \n"
+ "mat3 rot1 = mat3(1.0, 0.0, 0.0, \n"
+ " 0.0, cos(hue), sin(hue), \n"
+ " 0.0, -sin(hue), cos(hue)); \n"
+ "mat3 rot2 = mat3( cos(theta), 0.0, sin(theta), \n"
+ " 0.0, 1.0, 0.0, \n"
+ " -sin(theta), 0.0, cos(theta)); \n"
+ "vec3 dir = vec3(pos.yx - vec2(0.5), 0.0); \n"
+ "ipt = base + rot1 * rot2 * dir; \n"
+ // Convert back to RGB (for gamut boundary testing)
+ "lmspq = "$" * ipt; \n"
+ "lms = pow(max(lmspq, 0.0), vec3(1.0/%f)); \n"
+ "lms = max(lms - vec3(%f), 0.0) \n"
+ " / (vec3(%f) - %f * lms); \n"
+ "lms = pow(lms, vec3(1.0/%f)); \n"
+ "lms *= %f; \n"
+ // Check against src/dst gamut boundaries
+ "vec3 rgbsrc = "$" * lms; \n"
+ "vec3 rgbdst = "$" * lms; \n"
+ "bool insrc, indst; \n"
+ "insrc = all(lessThan(rgbsrc, vec3(rgbmax))) && \n"
+ " all(greaterThan(rgbsrc, vec3(rgbmin))); \n"
+ "indst = all(lessThan(rgbdst, vec3(rgbmax))) && \n"
+ " all(greaterThan(rgbdst, vec3(rgbmin))); \n"
+ // Sample from gamut mapping 3DLUT
+ "idx.x = (ipt.x - pqmin) / (pqmax - pqmin); \n"
+ "idx.y = 2.0 * length(ipt.yz); \n"
+ "idx.z = %f * atan(ipt.z, ipt.y) + 0.5; \n"
+ "vec3 mapped = "$"(idx).xyz; \n"
+ "mapped.yz -= vec2(32768.0/65535.0); \n"
+ "float mappedhue = atan(mapped.z, mapped.y); \n"
+ "float mappedchroma = length(mapped.yz); \n"
+ "ipt = mapped; \n"
+ // Visualize gamuts
+ "if (!insrc && !indst) { \n"
+ " ipt = orig; \n"
+ "} else if (insrc && !indst) { \n"
+ " ipt.x -= 0.1; \n"
+ "} else if (indst && !insrc) { \n"
+ " ipt.x += 0.1; \n"
+ "} \n"
+ // Visualize iso-luminance and iso-hue lines
+ "vec3 line; \n"
+ "if (insrc && fract(50.0 * mapped.x) < 1e-1) { \n"
+ " float k = smoothstep(0.1, 0.0, abs(sin(theta))); \n"
+ " line.x = mix(mapped.x, 0.3, 0.5); \n"
+ " line.yz = sqrt(length(mapped.yz)) * \n"
+ " normalize(mapped.yz); \n"
+ " ipt = mix(ipt, line, k); \n"
+ "} \n"
+ "if (insrc && fract(10.0 * (mappedhue - hue)) < 1e-1) {\n"
+ " float k = smoothstep(0.3, 0.0, abs(cos(theta))); \n"
+ " line.x = mapped.x - 0.05; \n"
+ " line.yz = 1.2 * mapped.yz; \n"
+ " ipt = mix(ipt, line, k); \n"
+ "} \n"
+ "if (insrc && fract(100.0 * mappedchroma) < 1e-1) { \n"
+ " line.x = mapped.x + 0.1; \n"
+ " line.yz = 0.4 * mapped.yz; \n"
+ " ipt = mix(ipt, line, 0.5); \n"
+ "} \n"
+ "} \n",
+ // Arguments below are listed in the order of the placeholders above
+ rect_pos(sh, rc),
+ SH_FLOAT(params->min_luma), SH_FLOAT(params->max_luma),
+ SH_FLOAT(pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, params->min_luma)),
+ SH_FLOAT(pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, params->max_luma)),
+ SH_FLOAT_DYN(hue), SH_FLOAT_DYN(theta),
+ ipt2lms,
+ PQ_M2, PQ_C1, PQ_C2, PQ_C3, PQ_M1,
+ 10000 / PL_COLOR_SDR_WHITE,
+ lms2rgb_src,
+ lms2rgb_dst,
+ 0.5f / M_PI, // maps the atan() range [-pi, pi] onto [-0.5, 0.5]
+ lut);
+}
+
+// LUT fill callback for the tone mapping LUT: generates the curve straight
+// into `data`, using the tone mapping parameters passed via `params->priv`
+static void fill_tone_lut(void *data, const struct sh_lut_params *params)
+{
+    pl_tone_map_generate(data, params->priv);
+}
+
+// LUT fill callback for the gamut mapping 3DLUT.
+//
+// Generates the LUT as 3 floats per entry (using the gamut mapping parameters
+// passed via `params->priv`) and repacks it into `data` as 4 x 16-bit unsigned
+// integers per entry. The second and third components are stored with a bias
+// of `UINT16_MAX >> 1` so that negative values can be represented; the fourth
+// component is padding.
+static void fill_gamut_lut(void *data, const struct sh_lut_params *params)
+{
+    const struct pl_gamut_map_params *lut_params = params->priv;
+    const int lut_size = params->width * params->height * params->depth;
+    void *tmp = pl_alloc(NULL, lut_size * sizeof(float) * lut_params->lut_stride);
+    pl_gamut_map_generate(tmp, lut_params);
+
+    // Convert to 16-bit unsigned integer for GPU texture
+    const float *in = tmp;
+    uint16_t *out = data;
+    pl_assert(lut_params->lut_stride == 3);
+    pl_assert(params->comps == 4);
+
+    // NOTE(review): the shader side subtracts 32768.0/65535.0 when decoding,
+    // whereas this bias is 32767 - confirm whether the 1 LSB offset is intended
+    const float bias = UINT16_MAX >> 1;
+    for (int i = 0; i < lut_size; i++) {
+        const float scaled[3] = {
+            in[0] * UINT16_MAX,
+            in[1] * UINT16_MAX + bias,
+            in[2] * UINT16_MAX + bias,
+        };
+
+        // Clamp before converting: converting an out-of-range (or NaN) float
+        // to an integer type is undefined behavior
+        for (int c = 0; c < 3; c++)
+            out[c] = roundf(fminf(fmaxf(scaled[c], 0.0f), UINT16_MAX));
+
+        // Never leave the padding component indeterminate, since the whole
+        // buffer is handed on as texture data
+        out[3] = 0;
+
+        in += 3;
+        out += 4;
+    }
+
+    pl_free(tmp);
+}
+
+// Emit GLSL that maps `color` from the color space `args->src` to `args->dst`,
+// performing tone mapping and gamut mapping as configured by `params` (NULL
+// selects `pl_color_map_default_params`).
+//
+// If `args->state` is NULL, no LUT can be stored, so tone mapping is reduced
+// to clip/linear and gamut mapping to clip/saturation. Unless
+// `args->prelinearized` is set, the input is linearized here first. On the
+// normal path the result is delinearized to `dst` before returning; failures
+// while generating a LUT mark the shader as failed and return early.
+void pl_shader_color_map_ex(pl_shader sh, const struct pl_color_map_params *params,
+ const struct pl_color_map_args *args)
+{
+ if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+ return;
+
+ // Work on local copies so that inference can fill in unspecified fields
+ struct pl_color_space src = args->src, dst = args->dst;
+ pl_color_space_infer_map(&src, &dst);
+ if (pl_color_space_equal(&src, &dst)) {
+ // Nothing to map; only undo a linearization done by the caller
+ if (args->prelinearized)
+ pl_shader_delinearize(sh, &dst);
+ return;
+ }
+
+ struct sh_color_map_obj *obj = NULL;
+ if (args->state) {
+ // NOTE(review): presumably overrides src.hdr with metadata measured by
+ // peak detection, when available - confirm against the helper
+ pl_get_detected_hdr_metadata(*args->state, &src.hdr);
+ obj = SH_OBJ(sh, args->state, PL_SHADER_OBJ_COLOR_MAP, struct sh_color_map_obj,
+ sh_color_map_uninit);
+ if (!obj)
+ return;
+ }
+
+ params = PL_DEF(params, &pl_color_map_default_params);
+ GLSL("// pl_shader_color_map \n"
+ "{ \n");
+
+ // Tone mapping is parametrized in PQ scaling on both the input and output
+ struct pl_tone_map_params tone = {
+ .function = PL_DEF(params->tone_mapping_function, &pl_tone_map_clip),
+ .constants = params->tone_constants,
+ .param = params->tone_mapping_param,
+ .input_scaling = PL_HDR_PQ,
+ .output_scaling = PL_HDR_PQ,
+ .lut_size = PL_DEF(params->lut_size, pl_color_map_default_params.lut_size),
+ .hdr = src.hdr,
+ };
+
+ // Source range: derived using the metadata type requested by the user
+ pl_color_space_nominal_luma_ex(pl_nominal_luma_params(
+ .color = &src,
+ .metadata = params->metadata,
+ .scaling = tone.input_scaling,
+ .out_min = &tone.input_min,
+ .out_max = &tone.input_max,
+ .out_avg = &tone.input_avg,
+ ));
+
+ // Target range: always derived from HDR10 metadata
+ pl_color_space_nominal_luma_ex(pl_nominal_luma_params(
+ .color = &dst,
+ .metadata = PL_HDR_METADATA_HDR10,
+ .scaling = tone.output_scaling,
+ .out_min = &tone.output_min,
+ .out_max = &tone.output_max,
+ ));
+
+ pl_tone_map_params_infer(&tone);
+
+ // Round sufficiently similar values
+ if (fabs(tone.input_max - tone.output_max) < 1e-6)
+ tone.output_max = tone.input_max;
+ if (fabs(tone.input_min - tone.output_min) < 1e-6)
+ tone.output_min = tone.input_min;
+
+ if (!params->inverse_tone_mapping) {
+ // Never exceed the source unless requested, but still allow
+ // black point adaptation
+ tone.output_max = PL_MIN(tone.output_max, tone.input_max);
+ }
+
+ const int *lut3d_size_def = pl_color_map_default_params.lut3d_size;
+ struct pl_gamut_map_params gamut = {
+ .function = PL_DEF(params->gamut_mapping, &pl_gamut_map_clip),
+ .constants = params->gamut_constants,
+ .input_gamut = src.hdr.prim,
+ .output_gamut = dst.hdr.prim,
+ .lut_size_I = PL_DEF(params->lut3d_size[0], lut3d_size_def[0]),
+ .lut_size_C = PL_DEF(params->lut3d_size[1], lut3d_size_def[1]),
+ .lut_size_h = PL_DEF(params->lut3d_size[2], lut3d_size_def[2]),
+ .lut_stride = 3,
+ };
+
+ // NOTE(review): `src_peak_static` is written here but not read anywhere
+ // else in this function - confirm whether this query is still needed
+ float src_peak_static;
+ pl_color_space_nominal_luma_ex(pl_nominal_luma_params(
+ .color = &src,
+ .metadata = PL_HDR_METADATA_HDR10,
+ .scaling = PL_HDR_PQ,
+ .out_max = &src_peak_static,
+ ));
+
+ // The gamut mapping luminance range is that of the target (in PQ)
+ pl_color_space_nominal_luma_ex(pl_nominal_luma_params(
+ .color = &dst,
+ .metadata = PL_HDR_METADATA_HDR10,
+ .scaling = PL_HDR_PQ,
+ .out_min = &gamut.min_luma,
+ .out_max = &gamut.max_luma,
+ ));
+
+ // Clip the gamut mapping output to the input gamut if disabled
+ if (!params->gamut_expansion && gamut.function->bidirectional) {
+ if (pl_primaries_compatible(&gamut.input_gamut, &gamut.output_gamut)) {
+ gamut.output_gamut = pl_primaries_clip(&gamut.output_gamut,
+ &gamut.input_gamut);
+ }
+ }
+
+ // Backwards compatibility with older API
+ switch (params->gamut_mode) {
+ case PL_GAMUT_CLIP:
+ switch (params->intent) {
+ case PL_INTENT_AUTO:
+ case PL_INTENT_PERCEPTUAL:
+ case PL_INTENT_RELATIVE_COLORIMETRIC:
+ break; // leave default
+ case PL_INTENT_SATURATION:
+ gamut.function = &pl_gamut_map_saturation;
+ break;
+ case PL_INTENT_ABSOLUTE_COLORIMETRIC:
+ gamut.function = &pl_gamut_map_absolute;
+ break;
+ }
+ break;
+ case PL_GAMUT_DARKEN:
+ gamut.function = &pl_gamut_map_darken;
+ break;
+ case PL_GAMUT_WARN:
+ gamut.function = &pl_gamut_map_highlight;
+ break;
+ case PL_GAMUT_DESATURATE:
+ gamut.function = &pl_gamut_map_desaturate;
+ break;
+ case PL_GAMUT_MODE_COUNT:
+ pl_unreachable();
+ }
+
+ // `can_fast` permits the LUT-free code paths below (closed-form clip/linear
+ // tone mapping, and saturation gamut mapping folded into a matrix)
+ bool can_fast = !params->force_tone_mapping_lut;
+ if (!args->state) {
+ // No state object provided, forcibly disable advanced methods
+ can_fast = true;
+ if (tone.function != &pl_tone_map_clip)
+ tone.function = &pl_tone_map_linear;
+ if (gamut.function != &pl_gamut_map_clip)
+ gamut.function = &pl_gamut_map_saturation;
+ }
+
+ // The gamut LUT is uploaded as a linearly filterable 4x16-bit UNORM
+ // texture; without such a format, fall back to saturation mapping
+ pl_fmt gamut_fmt = pl_find_fmt(SH_GPU(sh), PL_FMT_UNORM, 4, 16, 16, PL_FMT_CAP_LINEAR);
+ if (!gamut_fmt) {
+ gamut.function = &pl_gamut_map_saturation;
+ can_fast = true;
+ }
+
+ bool need_tone_map = !pl_tone_map_params_noop(&tone);
+ bool need_gamut_map = !pl_gamut_map_params_noop(&gamut);
+
+ if (!args->prelinearized)
+ pl_shader_linearize(sh, &src);
+
+ pl_matrix3x3 rgb2lms = pl_ipt_rgb2lms(pl_raw_primaries_get(src.primaries));
+ pl_matrix3x3 lms2rgb = pl_ipt_lms2rgb(pl_raw_primaries_get(dst.primaries));
+ ident_t lms2ipt = SH_MAT3(pl_ipt_lms2ipt);
+ ident_t ipt2lms = SH_MAT3(pl_ipt_ipt2lms);
+
+ // Fast saturation mapping: folded into the output matrix instead of a LUT
+ if (need_gamut_map && gamut.function == &pl_gamut_map_saturation && can_fast) {
+ const pl_matrix3x3 lms2src = pl_ipt_lms2rgb(&gamut.input_gamut);
+ const pl_matrix3x3 dst2lms = pl_ipt_rgb2lms(&gamut.output_gamut);
+ sh_describe(sh, "gamut map (saturation)");
+ pl_matrix3x3_mul(&lms2rgb, &dst2lms);
+ pl_matrix3x3_mul(&lms2rgb, &lms2src);
+ need_gamut_map = false;
+ }
+
+ // Fast path: simply convert between primaries (if needed)
+ if (!need_tone_map && !need_gamut_map) {
+ if (src.primaries != dst.primaries) {
+ sh_describe(sh, "colorspace conversion");
+ pl_matrix3x3_mul(&lms2rgb, &rgb2lms);
+ GLSL("color.rgb = "$" * color.rgb; \n", SH_MAT3(lms2rgb));
+ }
+ goto done;
+ }
+
+ // Full path: convert input from normalized RGB to IPT
+ GLSL("vec3 lms = "$" * color.rgb; \n"
+ "vec3 lmspq = %f * lms; \n"
+ "lmspq = pow(max(lmspq, 0.0), vec3(%f)); \n"
+ "lmspq = (vec3(%f) + %f * lmspq) \n"
+ " / (vec3(1.0) + %f * lmspq); \n"
+ "lmspq = pow(lmspq, vec3(%f)); \n"
+ "vec3 ipt = "$" * lmspq; \n"
+ "float i_orig = ipt.x; \n",
+ SH_MAT3(rgb2lms),
+ PL_COLOR_SDR_WHITE / 10000,
+ PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2,
+ lms2ipt);
+
+ if (params->show_clipping) {
+ // Flag pixels outside the source range (with a small tolerance), both
+ // per RGB channel and in terms of the I component
+ const float eps = 1e-6f;
+ GLSL("bool clip_hi, clip_lo; \n"
+ "clip_hi = any(greaterThan(color.rgb, vec3("$"))); \n"
+ "clip_lo = any(lessThan(color.rgb, vec3("$"))); \n"
+ "clip_hi = clip_hi || ipt.x > "$"; \n"
+ "clip_lo = clip_lo || ipt.x < "$"; \n",
+ SH_FLOAT_DYN(pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, tone.input_max) + eps),
+ SH_FLOAT(pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, tone.input_min) - eps),
+ SH_FLOAT_DYN(tone.input_max + eps),
+ SH_FLOAT(tone.input_min - eps));
+ }
+
+ if (need_tone_map) {
+ const struct pl_tone_map_function *fun = tone.function;
+ sh_describef(sh, "%s tone map (%.0f -> %.0f)", fun->name,
+ pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NITS, tone.input_max),
+ pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NITS, tone.output_max));
+
+ // Each branch below defines a GLSL `tone_map(x)` macro, which is
+ // #undef'd again near the end of this function
+ if (fun == &pl_tone_map_clip && can_fast) {
+
+ GLSL("#define tone_map(x) clamp((x), "$", "$") \n",
+ SH_FLOAT(tone.input_min),
+ SH_FLOAT_DYN(tone.input_max));
+
+ } else if (fun == &pl_tone_map_linear && can_fast) {
+
+ const float gain = tone.constants.exposure;
+ const float scale = tone.input_max - tone.input_min;
+
+ ident_t linfun = sh_fresh(sh, "linear_pq");
+ GLSLH("float "$"(float x) { \n"
+ // Stretch the input range (while clipping)
+ " x = "$" * x + "$"; \n"
+ " x = clamp(x, 0.0, 1.0); \n"
+ " x = "$" * x + "$"; \n"
+ " return x; \n"
+ "} \n",
+ linfun,
+ SH_FLOAT_DYN(gain / scale),
+ SH_FLOAT_DYN(-gain / scale * tone.input_min),
+ SH_FLOAT_DYN(tone.output_max - tone.output_min),
+ SH_FLOAT(tone.output_min));
+
+ GLSL("#define tone_map(x) ("$"(x)) \n", linfun);
+
+ } else {
+
+ // General case: 1D LUT, regenerated whenever the parameters differ
+ // from the ones stored in the state object
+ pl_assert(obj);
+ ident_t lut = sh_lut(sh, sh_lut_params(
+ .object = &obj->tone.lut,
+ .var_type = PL_VAR_FLOAT,
+ .lut_type = SH_LUT_AUTO,
+ .method = SH_LUT_LINEAR,
+ .width = tone.lut_size,
+ .comps = 1,
+ .update = !pl_tone_map_params_equal(&tone, &obj->tone.params),
+ .dynamic = tone.input_avg > 0, // dynamic metadata
+ .fill = fill_tone_lut,
+ .priv = &tone,
+ ));
+ obj->tone.params = tone;
+ if (!lut) {
+ SH_FAIL(sh, "Failed generating tone-mapping LUT!");
+ return;
+ }
+
+ // Rescale [input_min, input_max] to the LUT's [0, 1] domain
+ const float lut_range = tone.input_max - tone.input_min;
+ GLSL("#define tone_map(x) ("$"("$" * (x) + "$")) \n",
+ lut, SH_FLOAT_DYN(1.0f / lut_range),
+ SH_FLOAT_DYN(-tone.input_min / lut_range));
+
+ }
+
+ // Contrast recovery is only used when the range is not being expanded
+ // and the caller supplied a feature map
+ bool need_recovery = tone.input_max >= tone.output_max;
+ if (need_recovery && params->contrast_recovery && args->feature_map) {
+ ident_t pos, pt;
+ ident_t lowres = sh_bind(sh, args->feature_map, PL_TEX_ADDRESS_CLAMP,
+ PL_TEX_SAMPLE_LINEAR, "feature_map",
+ NULL, &pos, &pt);
+
+ // Obtain HF detail map from bicubic interpolation of LF features
+ GLSL("vec2 lpos = "$"; \n"
+ "vec2 lpt = "$"; \n"
+ "vec2 lsize = vec2(textureSize("$", 0)); \n"
+ "vec2 frac = fract(lpos * lsize + vec2(0.5)); \n"
+ "vec2 frac2 = frac * frac; \n"
+ "vec2 inv = vec2(1.0) - frac; \n"
+ "vec2 inv2 = inv * inv; \n"
+ "vec2 w0 = 1.0/6.0 * inv2 * inv; \n"
+ "vec2 w1 = 2.0/3.0 - 0.5 * frac2 * (2.0 - frac); \n"
+ "vec2 w2 = 2.0/3.0 - 0.5 * inv2 * (2.0 - inv); \n"
+ "vec2 w3 = 1.0/6.0 * frac2 * frac; \n"
+ "vec4 g = vec4(w0 + w1, w2 + w3); \n"
+ "vec4 h = vec4(w1, w3) / g + inv.xyxy; \n"
+ "h.xy -= vec2(2.0); \n"
+ "vec4 p = lpos.xyxy + lpt.xyxy * h; \n"
+ "float l00 = textureLod("$", p.xy, 0.0).r; \n"
+ "float l01 = textureLod("$", p.xw, 0.0).r; \n"
+ "float l0 = mix(l01, l00, g.y); \n"
+ "float l10 = textureLod("$", p.zy, 0.0).r; \n"
+ "float l11 = textureLod("$", p.zw, 0.0).r; \n"
+ "float l1 = mix(l11, l10, g.y); \n"
+ "float luma = mix(l1, l0, g.x); \n"
+ // Mix low-resolution tone mapped image with high-resolution
+ // tone mapped image according to desired strength.
+ "float highres = clamp(ipt.x, 0.0, 1.0); \n"
+ "float lowres = clamp(luma, 0.0, 1.0); \n"
+ "float detail = highres - lowres; \n"
+ "float base = tone_map(highres); \n"
+ "float sharp = tone_map(lowres) + detail; \n"
+ "ipt.x = clamp(mix(base, sharp, "$"), "$", "$"); \n",
+ pos, pt, lowres,
+ lowres, lowres, lowres, lowres,
+ SH_FLOAT(params->contrast_recovery),
+ SH_FLOAT(tone.output_min), SH_FLOAT_DYN(tone.output_max));
+
+ } else {
+
+ GLSL("ipt.x = tone_map(ipt.x); \n");
+ }
+
+ // Avoid raising saturation excessively when raising brightness, and
+ // also desaturate when reducing brightness greatly to account for the
+ // reduction in gamut volume.
+ GLSL("vec2 hull = vec2(i_orig, ipt.x); \n"
+ "hull = ((hull - 6.0) * hull + 9.0) * hull; \n"
+ "ipt.yz *= min(i_orig / ipt.x, hull.y / hull.x); \n");
+ }
+
+ if (need_gamut_map) {
+ const struct pl_gamut_map_function *fun = gamut.function;
+ sh_describef(sh, "gamut map (%s)", fun->name);
+
+ // The 3DLUT is identified by a signature of the gamut parameters and
+ // passed the shader's cache, rather than being tracked via `.update`
+ pl_assert(obj);
+ ident_t lut = sh_lut(sh, sh_lut_params(
+ .object = &obj->gamut.lut,
+ .var_type = PL_VAR_FLOAT,
+ .lut_type = SH_LUT_TEXTURE,
+ .fmt = gamut_fmt,
+ .method = params->lut3d_tricubic ? SH_LUT_CUBIC : SH_LUT_LINEAR,
+ .width = gamut.lut_size_I,
+ .height = gamut.lut_size_C,
+ .depth = gamut.lut_size_h,
+ .comps = 4,
+ .signature = gamut_map_signature(&gamut),
+ .cache = SH_CACHE(sh),
+ .fill = fill_gamut_lut,
+ .priv = &gamut,
+ ));
+ if (!lut) {
+ SH_FAIL(sh, "Failed generating gamut-mapping LUT!");
+ return;
+ }
+
+ // 3D LUT lookup (in ICh space)
+ const float lut_range = gamut.max_luma - gamut.min_luma;
+ GLSL("vec3 idx; \n"
+ "idx.x = "$" * ipt.x + "$"; \n"
+ "idx.y = 2.0 * length(ipt.yz); \n"
+ "idx.z = %f * atan(ipt.z, ipt.y) + 0.5;\n"
+ "ipt = "$"(idx).xyz; \n"
+ "ipt.yz -= vec2(32768.0/65535.0); \n",
+ SH_FLOAT(1.0f / lut_range),
+ SH_FLOAT(-gamut.min_luma / lut_range),
+ 0.5f / M_PI, lut);
+
+ if (params->show_clipping) {
+ GLSL("clip_lo = clip_lo || any(lessThan(idx, vec3(0.0))); \n"
+ "clip_hi = clip_hi || any(greaterThan(idx, vec3(1.0))); \n");
+ }
+
+ if (params->visualize_lut) {
+ visualize_gamut_map(sh, params->visualize_rect, lut,
+ params->visualize_hue, params->visualize_theta,
+ &gamut);
+ }
+ }
+
+ // Convert IPT back to linear RGB
+ GLSL("lmspq = "$" * ipt; \n"
+ "lms = pow(max(lmspq, 0.0), vec3(1.0/%f)); \n"
+ "lms = max(lms - vec3(%f), 0.0) \n"
+ " / (vec3(%f) - %f * lms); \n"
+ "lms = pow(lms, vec3(1.0/%f)); \n"
+ "lms *= %f; \n"
+ "color.rgb = "$" * lms; \n",
+ ipt2lms,
+ PQ_M2, PQ_C1, PQ_C2, PQ_C3, PQ_M1,
+ 10000 / PL_COLOR_SDR_WHITE,
+ SH_MAT3(lms2rgb));
+
+ if (params->show_clipping) {
+ // Replace flagged pixels by a highlight color: a red-tinted inversion
+ // for clip_hi, a blend towards dark cyan for clip_lo
+ GLSL("if (clip_hi) { \n"
+ " float k = dot(color.rgb, vec3(2.0 / 3.0)); \n"
+ " color.rgb = clamp(vec3(k) - color.rgb, 0.0, 1.0); \n"
+ " float cmin = min(min(color.r, color.g), color.b); \n"
+ " float cmax = max(max(color.r, color.g), color.b); \n"
+ " float delta = cmax - cmin; \n"
+ " vec3 sat = smoothstep(cmin - 1e-6, cmax, color.rgb); \n"
+ " const vec3 red = vec3(1.0, 0.0, 0.0); \n"
+ " color.rgb = mix(red, sat, smoothstep(0.0, 0.3, delta)); \n"
+ "} else if (clip_lo) { \n"
+ " vec3 hi = vec3(0.0, 0.3, 0.3); \n"
+ " color.rgb = mix(color.rgb, hi, 0.5); \n"
+ "} \n");
+ }
+
+ if (need_tone_map) {
+ if (params->visualize_lut) {
+ // When the gamut map is visualized as well, fade the tone mapping
+ // graph depending on the viewing angle `visualize_theta`
+ float alpha = need_gamut_map ? powf(cosf(params->visualize_theta), 5.0f) : 1.0f;
+ visualize_tone_map(sh, params->visualize_rect, alpha, &tone);
+ }
+ GLSL("#undef tone_map \n");
+ }
+
+done:
+ pl_shader_delinearize(sh, &dst);
+ GLSL("}\n");
+}
+
+// Legacy entry point: forwards its positional arguments to
+// `pl_shader_color_map_ex`, without providing any feature map
+void pl_shader_color_map(pl_shader sh, const struct pl_color_map_params *params,
+                         struct pl_color_space src, struct pl_color_space dst,
+                         pl_shader_obj *state, bool prelinearized)
+{
+    pl_shader_color_map_ex(sh, params, pl_color_map_args(
+        .feature_map    = NULL,
+        .state          = state,
+        .prelinearized  = prelinearized,
+        .src            = src,
+        .dst            = dst
+    ));
+}
+
+// Emit GLSL that multiplies `color.rgb` (linearized according to `csp`) by the
+// cone distortion matrix derived from `params`, then delinearizes it again.
+// Does nothing if `params` is NULL or selects no cones.
+void pl_shader_cone_distort(pl_shader sh, struct pl_color_space csp,
+                            const struct pl_cone_params *params)
+{
+    if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+        return;
+
+    const bool have_cones = params && params->cones;
+    if (!have_cones)
+        return;
+
+    sh_describe(sh, "cone distortion");
+    GLSL("// pl_shader_cone_distort\n");
+    GLSL("{\n");
+
+    // The matrix is applied to linear light, in the primaries of `csp`
+    pl_color_space_infer(&csp);
+    pl_shader_linearize(sh, &csp);
+
+    pl_matrix3x3 distort = pl_get_cone_matrix(params, pl_raw_primaries_get(csp.primaries));
+    GLSL("color.rgb = "$" * color.rgb; \n", sh_var(sh, (struct pl_shader_var) {
+        .var = pl_var_mat3("cone_mat"),
+        .data = PL_TRANSPOSE_3X3(distort.m),
+    }));
+
+    pl_shader_delinearize(sh, &csp);
+    GLSL("}\n");
+}
diff --git a/src/shaders/custom.c b/src/shaders/custom.c
new file mode 100644
index 0000000..3f03e57
--- /dev/null
+++ b/src/shaders/custom.c
@@ -0,0 +1,89 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "shaders.h"
+
+#include <libplacebo/shaders/custom.h>
+
+// Append a user-provided custom shader to `sh`.
+//
+// Every variable, descriptor, vertex attribute and constant in `params` is
+// registered with the shader and exposed to the custom code under its own
+// name via a `#define`. The optional prelude, header and body are then
+// emitted. Returns false if the compute or signature requirements of the
+// custom shader cannot be met, true otherwise.
+bool pl_shader_custom(pl_shader sh, const struct pl_custom_shader *params)
+{
+    if (params->compute) {
+        // A zero in either dimension falls back to 16 for that dimension,
+        // and marks the requested size as flexible
+        const bool flexible = !(params->compute_group_size[0] &&
+                                params->compute_group_size[1]);
+        const int group_w = PL_DEF(params->compute_group_size[0], 16);
+        const int group_h = PL_DEF(params->compute_group_size[1], 16);
+        if (!sh_try_compute(sh, group_w, group_h, flexible, params->compute_shmem))
+            return false;
+    }
+
+    if (!sh_require(sh, params->input, params->output_w, params->output_h))
+        return false;
+
+    sh->output = params->output;
+
+    for (int n = 0; n < params->num_variables; n++) {
+        const struct pl_shader_var *var = &params->variables[n];
+        GLSLP("#define %s "$"\n", var->var.name, sh_var(sh, *var));
+    }
+
+    for (int n = 0; n < params->num_descriptors; n++) {
+        const struct pl_shader_desc *desc = &params->descriptors[n];
+        GLSLP("#define %s "$"\n", desc->desc.name, sh_desc(sh, *desc));
+    }
+
+    for (int n = 0; n < params->num_vertex_attribs; n++) {
+        const struct pl_shader_va *attr = &params->vertex_attribs[n];
+        GLSLP("#define %s "$"\n", attr->attr.name, sh_attr(sh, *attr));
+    }
+
+    for (int n = 0; n < params->num_constants; n++) {
+        const struct pl_shader_const *con = &params->constants[n];
+        GLSLP("#define %s "$"\n", con->name, sh_const(sh, *con));
+    }
+
+    if (params->prelude)
+        GLSLP("// pl_shader_custom prelude: \n%s\n", params->prelude);
+    if (params->header)
+        GLSLH("// pl_shader_custom header: \n%s\n", params->header);
+
+    if (params->description)
+        sh_describef(sh, "%s", params->description);
+
+    if (!params->body)
+        return true;
+
+    // When the signature changes to a color output, the body needs a fresh
+    // `color` variable to write its result into
+    const char *decl = "";
+    if (params->output != params->input) {
+        switch (params->output) {
+        case PL_SHADER_SIG_NONE:
+            break;
+        case PL_SHADER_SIG_COLOR:
+            decl = "vec4 color = vec4(0.0);";
+            break;
+        case PL_SHADER_SIG_SAMPLER:
+            pl_unreachable();
+        }
+    }
+
+    GLSL("// pl_shader_custom \n"
+         "%s \n"
+         "{ \n"
+         "%s \n"
+         "} \n",
+         decl, params->body);
+
+    return true;
+}
diff --git a/src/shaders/custom_mpv.c b/src/shaders/custom_mpv.c
new file mode 100644
index 0000000..4ef0817
--- /dev/null
+++ b/src/shaders/custom_mpv.c
@@ -0,0 +1,1768 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <math.h>
+#include <limits.h>
+
+#include "gpu.h"
+#include "shaders.h"
+
+#include <libplacebo/shaders/colorspace.h>
+#include <libplacebo/shaders/custom.h>
+
+// Hard-coded size limits, mainly for convenience (to avoid dynamic memory)
+#define SHADER_MAX_HOOKS 16 // max. number of HOOK directives per pass
+#define SHADER_MAX_BINDS 16 // max. number of BIND directives per pass
+#define MAX_SHEXP_SIZE 32 // max. number of tokens in one RPN expression
+
+// Operators available in RPN expressions. The parser (parse_rpn_shexpr)
+// selects them by the first character of a token; SHEXP_OP_NOT is the only
+// monadic operator, all others are dyadic.
+enum shexp_op {
+ SHEXP_OP_ADD, // '+'
+ SHEXP_OP_SUB, // '-'
+ SHEXP_OP_MUL, // '*'
+ SHEXP_OP_DIV, // '/'
+ SHEXP_OP_MOD, // '%'
+ SHEXP_OP_NOT, // '!'
+ SHEXP_OP_GT, // '>'
+ SHEXP_OP_LT, // '<'
+ SHEXP_OP_EQ, // '='
+};
+
+// Kind of a single RPN expression token. SHEXP_END is deliberately 0, so that
+// zero-initialized trailing array elements terminate an expression.
+enum shexp_tag {
+ SHEXP_END = 0, // End of an RPN expression
+ SHEXP_CONST, // Push a constant value onto the stack
+ SHEXP_TEX_W, // Get the width/height of a named texture (variable)
+ SHEXP_TEX_H,
+ SHEXP_OP2, // Pop two elements and push the result of a dyadic operation
+ SHEXP_OP1, // Pop one element and push the result of a monadic operation
+ SHEXP_VAR, // Arbitrary variable (e.g. shader parameters)
+};
+
+// A single token of an RPN expression; `tag` selects the active `val` member
+struct shexp {
+ enum shexp_tag tag;
+ union {
+ float cval; // for SHEXP_CONST
+ pl_str varname; // for SHEXP_TEX_W, SHEXP_TEX_H and SHEXP_VAR
+ enum shexp_op op; // for SHEXP_OP1 and SHEXP_OP2
+ } val;
+};
+
+// Parsed representation of one pass of a user shader, as filled in by
+// parse_hook() from the `//!` header lines and the body following them
+struct custom_shader_hook {
+ // Variable/literal names of textures
+ pl_str pass_desc; // DESC (defaults to "unknown user shader")
+ pl_str hook_tex[SHADER_MAX_HOOKS]; // one entry per HOOK line
+ pl_str bind_tex[SHADER_MAX_BINDS]; // one entry per BIND line
+ pl_str save_tex; // SAVE; left empty to overwrite the hooked texture
+
+ // Shader body itself + metadata
+ pl_str pass_body;
+ float offset[2]; // OFFSET <x> <y>
+ bool offset_align; // OFFSET ALIGN
+ int comps; // COMPONENTS
+
+ // Special expressions governing the output size and execution conditions
+ struct shexp width[MAX_SHEXP_SIZE];
+ struct shexp height[MAX_SHEXP_SIZE];
+ struct shexp cond[MAX_SHEXP_SIZE];
+
+ // Special metadata for compute shaders
+ bool is_compute;
+ int block_w, block_h; // Block size (each block corresponds to one WG)
+ int threads_w, threads_h; // How many threads form a WG
+};
+
+// Parse a space-separated RPN expression from `line` into `out`.
+//
+// Recognized tokens, in order of precedence:
+//  - `NAME.w` / `NAME.width` and `NAME.h` / `NAME.height`: texture dimensions
+//  - tokens starting with one of `+ - * / % ! > < =`: operators
+//  - tokens starting with a digit: floating point constants
+//  - anything else: a generic variable name
+//
+// Returns false if the expression has more than MAX_SHEXP_SIZE tokens or
+// contains a malformed constant; `out` may be partially overwritten in that
+// case. An empty `line` leaves `out` untouched.
+static bool parse_rpn_shexpr(pl_str line, struct shexp out[MAX_SHEXP_SIZE])
+{
+    int pos = 0;
+
+    while (line.len > 0) {
+        pl_str word = pl_str_split_char(line, ' ', &line);
+        if (word.len == 0)
+            continue; // skip runs of consecutive spaces
+
+        if (pos >= MAX_SHEXP_SIZE)
+            return false;
+
+        struct shexp *exp = &out[pos++];
+
+        if (pl_str_eatend0(&word, ".w") || pl_str_eatend0(&word, ".width")) {
+            exp->tag = SHEXP_TEX_W;
+            exp->val.varname = word;
+            continue;
+        }
+
+        if (pl_str_eatend0(&word, ".h") || pl_str_eatend0(&word, ".height")) {
+            exp->tag = SHEXP_TEX_H;
+            exp->val.varname = word;
+            continue;
+        }
+
+        // Operators are identified by their first character only
+        switch (word.buf[0]) {
+        case '+': exp->tag = SHEXP_OP2; exp->val.op = SHEXP_OP_ADD; continue;
+        case '-': exp->tag = SHEXP_OP2; exp->val.op = SHEXP_OP_SUB; continue;
+        case '*': exp->tag = SHEXP_OP2; exp->val.op = SHEXP_OP_MUL; continue;
+        case '/': exp->tag = SHEXP_OP2; exp->val.op = SHEXP_OP_DIV; continue;
+        case '%': exp->tag = SHEXP_OP2; exp->val.op = SHEXP_OP_MOD; continue;
+        case '!': exp->tag = SHEXP_OP1; exp->val.op = SHEXP_OP_NOT; continue;
+        case '>': exp->tag = SHEXP_OP2; exp->val.op = SHEXP_OP_GT; continue;
+        case '<': exp->tag = SHEXP_OP2; exp->val.op = SHEXP_OP_LT; continue;
+        case '=': exp->tag = SHEXP_OP2; exp->val.op = SHEXP_OP_EQ; continue;
+        default: break;
+        }
+
+        if (word.buf[0] >= '0' && word.buf[0] <= '9') {
+            exp->tag = SHEXP_CONST;
+            if (!pl_str_parse_float(word, &exp->val.cval))
+                return false;
+            continue;
+        }
+
+        // Treat as generic variable
+        exp->tag = SHEXP_VAR;
+        exp->val.varname = word;
+    }
+
+    // Explicitly terminate the expression. `out` may already hold a longer
+    // expression (e.g. when the same directive is given twice), whose stale
+    // trailing tokens would otherwise remain part of the new expression.
+    if (pos > 0 && pos < MAX_SHEXP_SIZE)
+        out[pos] = (struct shexp) { .tag = SHEXP_END };
+
+    return true;
+}
+
+// Split `*body` at the next `//!` marker. Returns everything before the
+// marker and advances `*body` to the remainder, which (when non-empty) starts
+// with the marker itself.
+static inline pl_str split_magic(pl_str *body)
+{
+    pl_str head = pl_str_split_str0(*body, "//!", body);
+    if (!body->len)
+        return head;
+
+    // Rewind over the 3-byte separator, so that it stays part of the remainder
+    body->len += 3;
+    body->buf -= 3;
+    return head;
+}
+
+// Parse one shader pass from `*body` into `out`: first all leading `//!`
+// header lines, then the pass body, which extends up to the next `//!` marker
+// (if any). `*body` is advanced past everything that was consumed.
+//
+// Returns false (after logging an error to `log`) on a malformed or unknown
+// header line. A pass without any HOOK line only triggers a warning.
+static bool parse_hook(pl_log log, pl_str *body, struct custom_shader_hook *out)
+{
+    // Defaults: output has the size of HOOKED, and the condition is constant 1
+    *out = (struct custom_shader_hook){
+        .pass_desc = pl_str0("unknown user shader"),
+        .width = {{ SHEXP_TEX_W, { .varname = pl_str0("HOOKED") }}},
+        .height = {{ SHEXP_TEX_H, { .varname = pl_str0("HOOKED") }}},
+        .cond = {{ SHEXP_CONST, { .cval = 1.0 }}},
+    };
+
+    int num_hooks = 0;
+    int num_binds = 0;
+
+    // Parse all headers
+    for (;;) {
+        pl_str next;
+        pl_str line = pl_str_strip(pl_str_getline(*body, &next));
+
+        // Header lines are recognized by the magic line beginning; the first
+        // line without it belongs to the shader body
+        if (!pl_str_eatstart0(&line, "//!"))
+            break;
+
+        *body = next;
+
+        // Dispatch on the supported commands
+        if (pl_str_eatstart0(&line, "HOOK")) {
+
+            if (num_hooks == SHADER_MAX_HOOKS) {
+                pl_err(log, "Passes may only hook up to %d textures!",
+                       SHADER_MAX_HOOKS);
+                return false;
+            }
+            out->hook_tex[num_hooks++] = pl_str_strip(line);
+
+        } else if (pl_str_eatstart0(&line, "BIND")) {
+
+            if (num_binds == SHADER_MAX_BINDS) {
+                pl_err(log, "Passes may only bind up to %d textures!",
+                       SHADER_MAX_BINDS);
+                return false;
+            }
+            out->bind_tex[num_binds++] = pl_str_strip(line);
+
+        } else if (pl_str_eatstart0(&line, "SAVE")) {
+
+            pl_str name = pl_str_strip(line);
+            if (pl_str_equals0(name, "HOOKED")) {
+                // Special name meaning "overwrite the existing texture",
+                // which is signalled by leaving `save_tex` empty
+                out->save_tex = (pl_str) {0};
+            } else if (pl_str_equals0(name, "MAIN")) {
+                // Compatibility alias
+                out->save_tex = pl_str0("MAINPRESUB");
+            } else {
+                out->save_tex = name;
+            }
+
+        } else if (pl_str_eatstart0(&line, "DESC")) {
+
+            out->pass_desc = pl_str_strip(line);
+
+        } else if (pl_str_eatstart0(&line, "OFFSET")) {
+
+            line = pl_str_strip(line);
+            if (pl_str_equals0(line, "ALIGN")) {
+                out->offset_align = true;
+            } else {
+                // Exactly two floats, with nothing left over
+                bool ok = pl_str_parse_float(pl_str_split_char(line, ' ', &line), &out->offset[0]) &&
+                          pl_str_parse_float(pl_str_split_char(line, ' ', &line), &out->offset[1]) &&
+                          !line.len;
+                if (!ok) {
+                    pl_err(log, "Error while parsing OFFSET!");
+                    return false;
+                }
+            }
+
+        } else if (pl_str_eatstart0(&line, "WIDTH")) {
+
+            if (!parse_rpn_shexpr(line, out->width)) {
+                pl_err(log, "Error while parsing WIDTH!");
+                return false;
+            }
+
+        } else if (pl_str_eatstart0(&line, "HEIGHT")) {
+
+            if (!parse_rpn_shexpr(line, out->height)) {
+                pl_err(log, "Error while parsing HEIGHT!");
+                return false;
+            }
+
+        } else if (pl_str_eatstart0(&line, "WHEN")) {
+
+            if (!parse_rpn_shexpr(line, out->cond)) {
+                pl_err(log, "Error while parsing WHEN!");
+                return false;
+            }
+
+        } else if (pl_str_eatstart0(&line, "COMPONENTS")) {
+
+            if (!pl_str_parse_int(pl_str_strip(line), &out->comps)) {
+                pl_err(log, "Error parsing COMPONENTS: '%.*s'", PL_STR_FMT(line));
+                return false;
+            }
+
+        } else if (pl_str_eatstart0(&line, "COMPUTE")) {
+
+            // Mandatory block size, optionally followed by the thread count
+            line = pl_str_strip(line);
+            bool ok = pl_str_parse_int(pl_str_split_char(line, ' ', &line), &out->block_w) &&
+                      pl_str_parse_int(pl_str_split_char(line, ' ', &line), &out->block_h);
+
+            line = pl_str_strip(line);
+            if (ok && line.len) {
+                ok = pl_str_parse_int(pl_str_split_char(line, ' ', &line), &out->threads_w) &&
+                     pl_str_parse_int(pl_str_split_char(line, ' ', &line), &out->threads_h) &&
+                     !line.len;
+            } else {
+                // Without an explicit thread count, use one thread per pixel
+                // of the block
+                out->threads_w = out->block_w;
+                out->threads_h = out->block_h;
+            }
+
+            if (!ok) {
+                pl_err(log, "Error while parsing COMPUTE!");
+                return false;
+            }
+
+            out->is_compute = true;
+
+        } else {
+
+            // Unknown command type
+            pl_err(log, "Unrecognized command '%.*s'!", PL_STR_FMT(line));
+            return false;
+
+        }
+    }
+
+    // Everything up to the next magic line beginning (if any) is the body of
+    // this pass
+    out->pass_body = split_magic(body);
+
+    // Sanity checking
+    if (num_hooks == 0)
+        pl_warn(log, "Pass has no hooked textures (will be ignored)!");
+
+    return true;
+}
+
+// Parse a `//!TEXTURE` section from `*body` and create the texture it
+// describes.
+//
+// All leading `//!` header lines are consumed first (TEXTURE, SIZE, FORMAT,
+// FILTER, BORDER, STORAGE). The rest of the section, up to the next `//!`
+// marker, is decoded as hexadecimal initial texture data, whose length must
+// match the texture size exactly; only STORAGE textures may omit it entirely.
+//
+// On success, `out` describes the new texture (its name is allocated from
+// `alloc`) and true is returned. On failure, an error is logged via `gpu` and
+// false is returned.
+static bool parse_tex(pl_gpu gpu, void *alloc, pl_str *body,
+                      struct pl_shader_desc *out)
+{
+    *out = (struct pl_shader_desc) {
+        .desc = {
+            .name = "USER_TEX",
+            .type = PL_DESC_SAMPLED_TEX,
+        },
+    };
+
+    struct pl_tex_params params = {
+        .w = 1, .h = 1, .d = 0,
+        .sampleable = true,
+        .debug_tag = PL_DEBUG_TAG,
+    };
+
+    while (true) {
+        pl_str rest;
+        pl_str line = pl_str_strip(pl_str_getline(*body, &rest));
+
+        // The first line without the magic prefix starts the texture data
+        if (!pl_str_eatstart0(&line, "//!"))
+            break;
+
+        *body = rest;
+
+        if (pl_str_eatstart0(&line, "TEXTURE")) {
+            out->desc.name = pl_strdup0(alloc, pl_str_strip(line));
+            continue;
+        }
+
+        if (pl_str_eatstart0(&line, "SIZE")) {
+            line = pl_str_strip(line);
+            int dims = 0;
+            int dim[4]; // extra space to catch invalid extra entries
+            while (line.len && dims < PL_ARRAY_SIZE(dim)) {
+                if (!pl_str_parse_int(pl_str_split_char(line, ' ', &line), &dim[dims++])) {
+                    PL_ERR(gpu, "Error while parsing SIZE!");
+                    return false;
+                }
+            }
+
+            uint32_t lim = dims == 1 ? gpu->limits.max_tex_1d_dim
+                         : dims == 2 ? gpu->limits.max_tex_2d_dim
+                         : dims == 3 ? gpu->limits.max_tex_3d_dim
+                         : 0;
+
+            // Sanity check against GPU size limits
+            switch (dims) {
+            case 3:
+                params.d = dim[2];
+                if (params.d < 1 || params.d > lim) {
+                    PL_ERR(gpu, "SIZE %d exceeds GPU's texture size limits (%d)!",
+                           params.d, lim);
+                    return false;
+                }
+                // fall through
+            case 2:
+                params.h = dim[1];
+                if (params.h < 1 || params.h > lim) {
+                    PL_ERR(gpu, "SIZE %d exceeds GPU's texture size limits (%d)!",
+                           params.h, lim);
+                    return false;
+                }
+                // fall through
+            case 1:
+                params.w = dim[0];
+                if (params.w < 1 || params.w > lim) {
+                    PL_ERR(gpu, "SIZE %d exceeds GPU's texture size limits (%d)!",
+                           params.w, lim);
+                    return false;
+                }
+                break;
+
+            default:
+                PL_ERR(gpu, "Invalid number of texture dimensions!");
+                return false;
+            }
+
+            // Clear out the superfluous components
+            if (dims < 3)
+                params.d = 0;
+            if (dims < 2)
+                params.h = 0;
+            continue;
+        }
+
+        if (pl_str_eatstart0(&line, "FORMAT")) {
+            // Look the format up by name among the formats the GPU offers
+            line = pl_str_strip(line);
+            params.format = NULL;
+            for (int n = 0; n < gpu->num_formats; n++) {
+                pl_fmt fmt = gpu->formats[n];
+                if (pl_str_equals0(line, fmt->name)) {
+                    params.format = fmt;
+                    break;
+                }
+            }
+
+            if (!params.format || params.format->opaque) {
+                PL_ERR(gpu, "Unrecognized/unavailable FORMAT name: '%.*s'!",
+                       PL_STR_FMT(line));
+                return false;
+            }
+
+            if (!(params.format->caps & PL_FMT_CAP_SAMPLEABLE)) {
+                PL_ERR(gpu, "Chosen FORMAT '%.*s' is not sampleable!",
+                       PL_STR_FMT(line));
+                return false;
+            }
+            continue;
+        }
+
+        if (pl_str_eatstart0(&line, "FILTER")) {
+            line = pl_str_strip(line);
+            if (pl_str_equals0(line, "LINEAR")) {
+                out->binding.sample_mode = PL_TEX_SAMPLE_LINEAR;
+            } else if (pl_str_equals0(line, "NEAREST")) {
+                out->binding.sample_mode = PL_TEX_SAMPLE_NEAREST;
+            } else {
+                PL_ERR(gpu, "Unrecognized FILTER: '%.*s'!", PL_STR_FMT(line));
+                return false;
+            }
+            continue;
+        }
+
+        if (pl_str_eatstart0(&line, "BORDER")) {
+            line = pl_str_strip(line);
+            if (pl_str_equals0(line, "CLAMP")) {
+                out->binding.address_mode = PL_TEX_ADDRESS_CLAMP;
+            } else if (pl_str_equals0(line, "REPEAT")) {
+                out->binding.address_mode = PL_TEX_ADDRESS_REPEAT;
+            } else if (pl_str_equals0(line, "MIRROR")) {
+                out->binding.address_mode = PL_TEX_ADDRESS_MIRROR;
+            } else {
+                PL_ERR(gpu, "Unrecognized BORDER: '%.*s'!", PL_STR_FMT(line));
+                return false;
+            }
+            continue;
+        }
+
+        if (pl_str_eatstart0(&line, "STORAGE")) {
+            // Expose the texture as a read/write storage image instead of a
+            // sampled texture
+            params.storable = true;
+            out->desc.type = PL_DESC_STORAGE_IMG;
+            out->desc.access = PL_DESC_ACCESS_READWRITE;
+            out->memory = PL_MEMORY_COHERENT;
+            continue;
+        }
+
+        PL_ERR(gpu, "Unrecognized command '%.*s'!", PL_STR_FMT(line));
+        return false;
+    }
+
+    if (!params.format) {
+        PL_ERR(gpu, "No FORMAT specified!");
+        return false;
+    }
+
+    int caps = params.format->caps;
+    if (out->binding.sample_mode == PL_TEX_SAMPLE_LINEAR && !(caps & PL_FMT_CAP_LINEAR)) {
+        PL_ERR(gpu, "The specified texture format cannot be linear filtered!");
+        return false;
+    }
+
+    // Decode the rest of the section (up to the next //! marker) as raw hex
+    // data for the texture
+    pl_str tex, hexdata = split_magic(body);
+    if (!pl_str_decode_hex(NULL, pl_str_strip(hexdata), &tex)) {
+        PL_ERR(gpu, "Error while parsing TEXTURE body: must be a valid "
+               "hexadecimal sequence!");
+        return false;
+    }
+
+    // Count texels in size_t: each dimension is individually within the GPU's
+    // limits, but their product may still exceed INT_MAX (e.g. 2048^3), and
+    // signed integer overflow would be undefined behavior. All factors are
+    // known to be >= 1 at this point.
+    const size_t texels = (size_t) params.w * PL_DEF(params.h, 1) * PL_DEF(params.d, 1);
+    const size_t expected_len = texels * params.format->texel_size;
+    if (tex.len == 0 && params.storable) {
+        // In this case, it's okay that the texture has no initial data
+        pl_free_ptr(&tex.buf);
+    } else if (tex.len != expected_len) {
+        PL_ERR(gpu, "Shader TEXTURE size mismatch: got %zu bytes, expected %zu!",
+               tex.len, expected_len);
+        pl_free(tex.buf);
+        return false;
+    }
+
+    // The decoded data is only needed for the duration of texture creation
+    params.initial_data = tex.buf;
+    out->binding.object = pl_tex_create(gpu, &params);
+    pl_free(tex.buf);
+
+    if (!out->binding.object) {
+        PL_ERR(gpu, "Failed creating custom texture!");
+        return false;
+    }
+
+    return true;
+}
+
+// Parses one `//!BUFFER` section from `*body` into `*out` and creates the
+// corresponding GPU buffer. Header lines (`//!BUFFER`, `//!STORAGE`,
+// `//!VAR`) are consumed from `*body`; the remaining section text is decoded
+// as hex and used as the initial buffer contents. Strings that must outlive
+// this call are allocated on `alloc`. Returns false (after logging) on any
+// parse or creation error.
+static bool parse_buf(pl_gpu gpu, void *alloc, pl_str *body,
+ struct pl_shader_desc *out)
+{
+ // Defaults: a uniform buffer named USER_BUF unless headers override this
+ *out = (struct pl_shader_desc) {
+ .desc = {
+ .name = "USER_BUF",
+ .type = PL_DESC_BUF_UNIFORM,
+ },
+ };
+
+ // Temporary, to allow deferring variable placement until all headers
+ // have been processed (in order to e.g. determine buffer type)
+ void *tmp = pl_tmp(alloc); // will be freed automatically on failure
+ PL_ARRAY(struct pl_var) vars = {0};
+
+ while (true) {
+ pl_str rest;
+ pl_str line = pl_str_strip(pl_str_getline(*body, &rest));
+
+ // The first line that is not a `//!` header ends the header block and
+ // is left unconsumed in `*body`
+ if (!pl_str_eatstart0(&line, "//!"))
+ break;
+
+ *body = rest;
+
+ if (pl_str_eatstart0(&line, "BUFFER")) {
+ out->desc.name = pl_strdup0(alloc, pl_str_strip(line));
+ continue;
+ }
+
+ if (pl_str_eatstart0(&line, "STORAGE")) {
+ out->desc.type = PL_DESC_BUF_STORAGE;
+ out->desc.access = PL_DESC_ACCESS_READWRITE;
+ out->memory = PL_MEMORY_COHERENT;
+ continue;
+ }
+
+ if (pl_str_eatstart0(&line, "VAR")) {
+ // Syntax: VAR <glsl type> <name>[<array size>]
+ pl_str type_name = pl_str_split_char(pl_str_strip(line), ' ', &line);
+ struct pl_var var = {0};
+ for (const struct pl_named_var *nv = pl_var_glsl_types; nv->glsl_name; nv++) {
+ if (pl_str_equals0(type_name, nv->glsl_name)) {
+ var = nv->var;
+ break;
+ }
+ }
+
+ if (!var.type) {
+ // No type found
+ PL_ERR(gpu, "Unrecognized GLSL type '%.*s'!", PL_STR_FMT(type_name));
+ return false;
+ }
+
+ pl_str var_name = pl_str_split_char(line, '[', &line);
+ if (line.len > 0) {
+ // Parse array dimension
+ if (!pl_str_parse_int(pl_str_split_char(line, ']', NULL), &var.dim_a)) {
+ PL_ERR(gpu, "Failed parsing array dimension from [%.*s!",
+ PL_STR_FMT(line));
+ return false;
+ }
+
+ if (var.dim_a < 1) {
+ PL_ERR(gpu, "Invalid array dimension %d!", var.dim_a);
+ return false;
+ }
+ }
+
+ var.name = pl_strdup0(alloc, pl_str_strip(var_name));
+ PL_ARRAY_APPEND(tmp, vars, var);
+ continue;
+ }
+
+ PL_ERR(gpu, "Unrecognized command '%.*s'!", PL_STR_FMT(line));
+ return false;
+ }
+
+ // Try placing all of the buffer variables
+ for (int i = 0; i < vars.num; i++) {
+ if (!sh_buf_desc_append(alloc, gpu, out, NULL, vars.elem[i])) {
+ PL_ERR(gpu, "Custom buffer exceeds GPU limitations!");
+ return false;
+ }
+ }
+
+ // Decode the rest of the section (up to the next //! marker) as raw hex
+ // data for the buffer
+ pl_str data, hexdata = split_magic(body);
+ if (!pl_str_decode_hex(tmp, pl_str_strip(hexdata), &data)) {
+ PL_ERR(gpu, "Error while parsing BUFFER body: must be a valid "
+ "hexadecimal sequence!");
+ return false;
+ }
+
+ size_t buf_size = sh_buf_desc_size(out);
+ if (data.len == 0 && out->desc.type == PL_DESC_BUF_STORAGE) {
+ // In this case, it's okay that the buffer has no initial data
+ } else if (data.len != buf_size) {
+ PL_ERR(gpu, "Shader BUFFER size mismatch: got %zu bytes, expected %zu!",
+ data.len, buf_size);
+ return false;
+ }
+
+ out->binding.object = pl_buf_create(gpu, pl_buf_params(
+ .size = buf_size,
+ .uniform = out->desc.type == PL_DESC_BUF_UNIFORM,
+ .storable = out->desc.type == PL_DESC_BUF_STORAGE,
+ .initial_data = data.len ? data.buf : NULL,
+ ));
+
+ if (!out->binding.object) {
+ PL_ERR(gpu, "Failed creating custom buffer!");
+ return false;
+ }
+
+ // Success: the temporary variable list and decoded data are released here
+ pl_free(tmp);
+ return true;
+}
+
+// Parses a single scalar of the given `type` from `str` into `*out`.
+// An empty string is accepted and leaves `*out` untouched. Anything left
+// over after the first space-delimited token (other than whitespace) is
+// treated as an error. Logs and returns false on failure.
+static bool parse_var(pl_log log, pl_str str, enum pl_var_type type, pl_var_data *out)
+{
+    if (str.len == 0)
+        return true;
+
+    // Split off the first token; `rest` holds whatever follows it
+    pl_str rest = str;
+    pl_str token = pl_str_split_char(rest, ' ', &rest);
+
+    bool parsed = false;
+    switch (type) {
+    case PL_VAR_SINT:
+        parsed = pl_str_parse_int(token, &out->i);
+        break;
+    case PL_VAR_UINT:
+        parsed = pl_str_parse_uint(token, &out->u);
+        break;
+    case PL_VAR_FLOAT:
+        parsed = pl_str_parse_float(token, &out->f);
+        break;
+    case PL_VAR_INVALID:
+    case PL_VAR_TYPE_COUNT:
+        pl_unreachable();
+    }
+
+    // Success requires both a valid token and no trailing garbage
+    if (parsed && pl_str_strip(rest).len == 0)
+        return true;
+
+    pl_err(log, "Failed parsing variable data: %.*s", PL_STR_FMT(str));
+    return false;
+}
+
+// Verifies that `data` lies within [`minimum`, `maximum`], interpreting all
+// three values according to `type`. Logs an error and returns false when the
+// value is out of range.
+static bool check_bounds(pl_log log, enum pl_var_type type, const pl_var_data data,
+                         const pl_var_data minimum, const pl_var_data maximum)
+{
+    switch (type) {
+    case PL_VAR_SINT:
+        if (data.i < minimum.i) {
+            pl_err(log, "Initial value %d below declared minimum %d!",
+                   data.i, minimum.i);
+            return false;
+        }
+        if (data.i > maximum.i) {
+            pl_err(log, "Initial value %d above declared maximum %d!",
+                   data.i, maximum.i);
+            return false;
+        }
+        break;
+
+    case PL_VAR_UINT:
+        if (data.u < minimum.u) {
+            pl_err(log, "Initial value %u below declared minimum %u!",
+                   data.u, minimum.u);
+            return false;
+        }
+        if (data.u > maximum.u) {
+            pl_err(log, "Initial value %u above declared maximum %u!",
+                   data.u, maximum.u);
+            return false;
+        }
+        break;
+
+    case PL_VAR_FLOAT:
+        if (data.f < minimum.f) {
+            pl_err(log, "Initial value %f below declared minimum %f!",
+                   data.f, minimum.f);
+            return false;
+        }
+        if (data.f > maximum.f) {
+            pl_err(log, "Initial value %f above declared maximum %f!",
+                   data.f, maximum.f);
+            return false;
+        }
+        break;
+
+    case PL_VAR_INVALID:
+    case PL_VAR_TYPE_COUNT:
+        pl_unreachable();
+    }
+
+    return true;
+}
+
+// Parses one `//!PARAM` section from `*body` into `*out`. Header lines
+// (`//!PARAM`, `//!DESC`, `//!MINIMUM`, `//!MAXIMUM`, `//!TYPE`) are consumed
+// first; the remaining section text holds either the initial value or, for
+// ENUM parameters, one enum name per line. Strings stored in `*out` are
+// allocated on `alloc`. Returns false (after logging) on any error.
+static bool parse_param(pl_log log, void *alloc, pl_str *body,
+ struct pl_hook_par *out)
+{
+ *out = (struct pl_hook_par) {0};
+ // Bounds are kept as text until the type is known, then parsed below
+ pl_str minimum = {0};
+ pl_str maximum = {0};
+ bool is_enum = false;
+
+ while (true) {
+ pl_str rest;
+ pl_str line = pl_str_strip(pl_str_getline(*body, &rest));
+
+ // The first non-header line ends the header block (left unconsumed)
+ if (!pl_str_eatstart0(&line, "//!"))
+ break;
+
+ *body = rest;
+
+ if (pl_str_eatstart0(&line, "PARAM")) {
+ out->name = pl_strdup0(alloc, pl_str_strip(line));
+ continue;
+ }
+
+ if (pl_str_eatstart0(&line, "DESC")) {
+ out->description = pl_strdup0(alloc, pl_str_strip(line));
+ continue;
+ }
+
+ if (pl_str_eatstart0(&line, "MINIMUM")) {
+ minimum = pl_str_strip(line);
+ continue;
+ }
+
+ if (pl_str_eatstart0(&line, "MAXIMUM")) {
+ maximum = pl_str_strip(line);
+ continue;
+ }
+
+ if (pl_str_eatstart0(&line, "TYPE")) {
+ // Syntax: TYPE [ENUM] [DYNAMIC|CONSTANT|DEFINE] [<glsl type>]
+ line = pl_str_strip(line);
+ is_enum = pl_str_eatstart0(&line, "ENUM");
+ line = pl_str_strip(line);
+ if (pl_str_eatstart0(&line, "DYNAMIC")) {
+ out->mode = PL_HOOK_PAR_DYNAMIC;
+ } else if (pl_str_eatstart0(&line, "CONSTANT")) {
+ out->mode = PL_HOOK_PAR_CONSTANT;
+ } else if (pl_str_eatstart0(&line, "DEFINE")) {
+ // DEFINE parameters are always signed integers and take no
+ // explicit GLSL type
+ out->mode = PL_HOOK_PAR_DEFINE;
+ out->type = PL_VAR_SINT;
+ if (pl_str_strip(line).len > 0) {
+ pl_err(log, "TYPE DEFINE does not take any extra arguments, "
+ "unexpected: '%.*s'", PL_STR_FMT(line));
+ return false;
+ }
+ continue;
+ } else {
+ out->mode = PL_HOOK_PAR_VARIABLE;
+ }
+
+ // Whatever remains must name a scalar GLSL type
+ line = pl_str_strip(line);
+ for (const struct pl_named_var *nv = pl_var_glsl_types;
+ nv->glsl_name; nv++)
+ {
+ if (pl_str_equals0(line, nv->glsl_name)) {
+ if (nv->var.dim_v > 1 || nv->var.dim_m > 1) {
+ pl_err(log, "GLSL type '%s' is incompatible with "
+ "shader parameters, must be scalar type!",
+ nv->glsl_name);
+ return false;
+ }
+
+ out->type = nv->var.type;
+ if (is_enum && out->type != PL_VAR_SINT) {
+ pl_err(log, "ENUM is only compatible with type int/DEFINE!");
+ return false;
+ }
+ goto next;
+ }
+ }
+
+ pl_err(log, "Unrecognized GLSL type '%.*s'!", PL_STR_FMT(line));
+ return false;
+ }
+
+ pl_err(log, "Unrecognized command '%.*s'!", PL_STR_FMT(line));
+ return false;
+
+next: ;
+ }
+
+ // Default bounds: the full range of the parameter's type
+ switch (out->type) {
+ case PL_VAR_INVALID:
+ pl_err(log, "Missing variable type!");
+ return false;
+ case PL_VAR_SINT:
+ out->minimum.i = INT_MIN;
+ out->maximum.i = INT_MAX;
+ break;
+ case PL_VAR_UINT:
+ out->minimum.u = 0;
+ out->maximum.u = UINT_MAX;
+ break;
+ case PL_VAR_FLOAT:
+ out->minimum.f = -INFINITY;
+ out->maximum.f = INFINITY;
+ break;
+ case PL_VAR_TYPE_COUNT:
+ pl_unreachable();
+ }
+
+ pl_str initial = pl_str_strip(split_magic(body));
+ if (!initial.len) {
+ pl_err(log, "Missing initial parameter value!");
+ return false;
+ }
+
+ if (is_enum) {
+ // Each non-empty body line becomes one enum name; the value range is
+ // the index range of those names and the initial value is index 0.
+ // Any MINIMUM/MAXIMUM headers are not consulted in this branch.
+ PL_ARRAY(const char *) names = {0};
+ pl_assert(out->type == PL_VAR_SINT);
+ do {
+ pl_str line = pl_str_strip(pl_str_getline(initial, &initial));
+ if (!line.len)
+ continue;
+ PL_ARRAY_APPEND(alloc, names, pl_strdup0(alloc, line));
+ } while (initial.len);
+
+ pl_assert(names.num >= 1);
+ out->initial.i = 0;
+ out->minimum.i = 0;
+ out->maximum.i = names.num - 1;
+ out->names = names.elem;
+ } else {
+ // Empty minimum/maximum strings leave the type-wide defaults intact
+ if (!parse_var(log, initial, out->type, &out->initial))
+ return false;
+ if (!parse_var(log, minimum, out->type, &out->minimum))
+ return false;
+ if (!parse_var(log, maximum, out->type, &out->maximum))
+ return false;
+ if (!check_bounds(log, out->type, out->initial, out->minimum, out->maximum))
+ return false;
+ }
+
+ // The live value starts out as a separately allocated copy of the initial
+ out->data = pl_memdup(alloc, &out->initial, sizeof(out->initial));
+ return true;
+}
+
+// Translates an mpv-style stage/texture name into the matching libplacebo
+// hook stage. Returns 0 when the name does not correspond to any stage.
+static enum pl_hook_stage mp_stage_to_pl(pl_str stage)
+{
+    static const struct {
+        const char *mp_name;
+        enum pl_hook_stage pl_stage;
+    } stage_map[] = {
+        { "RGB",            PL_HOOK_RGB_INPUT },
+        { "LUMA",           PL_HOOK_LUMA_INPUT },
+        { "CHROMA",         PL_HOOK_CHROMA_INPUT },
+        { "ALPHA",          PL_HOOK_ALPHA_INPUT },
+        { "XYZ",            PL_HOOK_XYZ_INPUT },
+
+        { "CHROMA_SCALED",  PL_HOOK_CHROMA_SCALED },
+        { "ALPHA_SCALED",   PL_HOOK_ALPHA_SCALED },
+
+        { "NATIVE",         PL_HOOK_NATIVE },
+        { "MAINPRESUB",     PL_HOOK_RGB },
+        { "MAIN",           PL_HOOK_RGB }, // Note: conflicts with above!
+
+        { "LINEAR",         PL_HOOK_LINEAR },
+        { "SIGMOID",        PL_HOOK_SIGMOID },
+        { "PREKERNEL",      PL_HOOK_PRE_KERNEL },
+        { "POSTKERNEL",     PL_HOOK_POST_KERNEL },
+
+        { "SCALED",         PL_HOOK_SCALED },
+        { "PREOUTPUT",      PL_HOOK_PRE_OUTPUT },
+        { "OUTPUT",         PL_HOOK_OUTPUT },
+    };
+
+    // Names are matched exactly, so table order does not affect the result
+    for (size_t n = 0; n < sizeof(stage_map) / sizeof(stage_map[0]); n++) {
+        if (pl_str_equals0(stage, stage_map[n].mp_name))
+            return stage_map[n].pl_stage;
+    }
+
+    return 0;
+}
+
+// Translates a single libplacebo hook stage into its mpv-style name.
+// PL_HOOK_RGB maps to "MAINPRESUB". Any value outside the listed stages is
+// considered unreachable.
+static pl_str pl_stage_to_mp(enum pl_hook_stage stage)
+{
+    const char *mp_name = NULL;
+
+    switch (stage) {
+    case PL_HOOK_RGB_INPUT:     mp_name = "RGB";            break;
+    case PL_HOOK_LUMA_INPUT:    mp_name = "LUMA";           break;
+    case PL_HOOK_CHROMA_INPUT:  mp_name = "CHROMA";         break;
+    case PL_HOOK_ALPHA_INPUT:   mp_name = "ALPHA";          break;
+    case PL_HOOK_XYZ_INPUT:     mp_name = "XYZ";            break;
+
+    case PL_HOOK_CHROMA_SCALED: mp_name = "CHROMA_SCALED";  break;
+    case PL_HOOK_ALPHA_SCALED:  mp_name = "ALPHA_SCALED";   break;
+
+    case PL_HOOK_NATIVE:        mp_name = "NATIVE";         break;
+    case PL_HOOK_RGB:           mp_name = "MAINPRESUB";     break;
+
+    case PL_HOOK_LINEAR:        mp_name = "LINEAR";         break;
+    case PL_HOOK_SIGMOID:       mp_name = "SIGMOID";        break;
+    case PL_HOOK_PRE_KERNEL:    mp_name = "PREKERNEL";      break;
+    case PL_HOOK_POST_KERNEL:   mp_name = "POSTKERNEL";     break;
+
+    case PL_HOOK_SCALED:        mp_name = "SCALED";         break;
+    case PL_HOOK_PRE_OUTPUT:    mp_name = "PREOUTPUT";      break;
+    case PL_HOOK_OUTPUT:        mp_name = "OUTPUT";         break;
+    }
+
+    if (!mp_name)
+        pl_unreachable();
+
+    return pl_str0(mp_name);
+}
+
+// One parsed shader pass together with the set of stages it executes on
+struct hook_pass {
+ enum pl_hook_stage exec_stages; // bitmask of stages that trigger this pass
+ struct custom_shader_hook hook; // the parsed pass itself
+};
+
+// A named texture saved during hook execution, so that later passes can
+// bind it or refer to its size
+struct pass_tex {
+ pl_str name; // name under which the texture is looked up
+ pl_tex tex;
+
+ // Metadata
+ pl_rect2df rect;
+ struct pl_color_repr repr;
+ struct pl_color_space color;
+ int comps;
+};
+
+// Private state attached to a user shader hook
+struct hook_priv {
+ pl_log log;
+ pl_gpu gpu;
+ void *alloc; // allocation parent for arrays that grow at runtime
+
+ PL_ARRAY(struct hook_pass) hook_passes;
+ PL_ARRAY(struct pl_hook_par) hook_params;
+
+ // Fixed (for shader-local resources)
+ PL_ARRAY(struct pl_shader_desc) descriptors;
+
+ // Dynamic per pass
+ enum pl_hook_stage save_stages; // stages whose input texture gets saved
+ PL_ARRAY(struct pass_tex) pass_textures; // cleared by hook_reset()
+ pl_shader trc_helper; // reused for the linearize/delinearize sub-shaders
+
+ // State for PRNG/frame count
+ int frame_count;
+ uint64_t prng_state[4];
+};
+
+// Reset callback: drops every saved pass texture by zeroing the array count
+// (the array storage itself is kept).
+static void hook_reset(void *priv)
+{
+    ((struct hook_priv *) priv)->pass_textures.num = 0;
+}
+
+// Context during execution of a hook
+struct hook_ctx {
+ struct hook_priv *priv;
+ const struct pl_hook_params *params; // parameters of the current invocation
+ struct pass_tex hooked; // current state of the hooked texture
+};
+
+// Resolves the texture name `var` to a size, written to `size` as
+// { width, height }. Handles the special names HOOKED, NATIVE_CROPPED and
+// OUTPUT first, then searches the saved pass textures (treating MAIN as an
+// alias of MAINPRESUB). Returns false if no texture of that name is known.
+static bool lookup_tex(struct hook_ctx *ctx, pl_str var, float size[2])
+{
+    const struct pl_hook_params *hp = ctx->params;
+
+    if (pl_str_equals0(var, "HOOKED")) {
+        pl_tex hooked = ctx->hooked.tex;
+        pl_assert(hooked);
+        size[0] = hooked->params.w;
+        size[1] = hooked->params.h;
+        return true;
+    }
+
+    if (pl_str_equals0(var, "NATIVE_CROPPED")) {
+        size[0] = fabs(pl_rect_w(hp->src_rect));
+        size[1] = fabs(pl_rect_h(hp->src_rect));
+        return true;
+    }
+
+    if (pl_str_equals0(var, "OUTPUT")) {
+        size[0] = abs(pl_rect_w(hp->dst_rect));
+        size[1] = abs(pl_rect_h(hp->dst_rect));
+        return true;
+    }
+
+    // Saved textures are stored under MAINPRESUB, never under MAIN
+    pl_str key = var;
+    if (pl_str_equals0(key, "MAIN"))
+        key = pl_str0("MAINPRESUB");
+
+    const struct hook_priv *priv = ctx->priv;
+    for (int n = 0; n < priv->pass_textures.num; n++) {
+        const struct pass_tex *saved = &priv->pass_textures.elem[n];
+        if (!pl_str_equals(key, saved->name))
+            continue;
+
+        size[0] = saved->tex->params.w;
+        size[1] = saved->tex->params.h;
+        return true;
+    }
+
+    return false;
+}
+
+// Resolves `var` against the registered hook parameters and stores the
+// result in `*val`. A match on a parameter's name yields its current value;
+// a match on one of a parameter's enum names yields that name's index.
+// Logs a warning and returns false if nothing matches.
+static bool lookup_var(struct hook_ctx *ctx, pl_str var, float *val)
+{
+    struct hook_priv *priv = ctx->priv;
+
+    for (int n = 0; n < priv->hook_params.num; n++) {
+        const struct pl_hook_par *par = &priv->hook_params.elem[n];
+
+        if (pl_str_equals0(var, par->name)) {
+            if (par->type == PL_VAR_SINT) {
+                *val = par->data->i;
+                return true;
+            }
+            if (par->type == PL_VAR_UINT) {
+                *val = par->data->u;
+                return true;
+            }
+            if (par->type == PL_VAR_FLOAT) {
+                *val = par->data->f;
+                return true;
+            }
+
+            // Parameters always carry one of the three scalar types
+            pl_unreachable();
+        }
+
+        if (!par->names)
+            continue;
+
+        // Enum parameter: the names are indexed by value
+        for (int idx = par->minimum.i; idx <= par->maximum.i; idx++) {
+            if (pl_str_equals0(var, par->names[idx])) {
+                *val = idx;
+                return true;
+            }
+        }
+    }
+
+    PL_WARN(priv, "Variable '%.*s' not found in RPN expression!", PL_STR_FMT(var));
+    return false;
+}
+
+// Evaluates the RPN expression `expr` (at most MAX_SHEXP_SIZE elements,
+// terminated early by SHEXP_END) using a float stack. Texture sizes are
+// resolved via lookup_tex() and named variables via lookup_var().
+//
+// Returns whether successful. 'result' is left untouched on failure.
+// Failure cases (each logged as a warning): stack underflow, a non-finite
+// intermediate result, an unknown texture/variable name, or a stack that does
+// not hold exactly one element at the end.
+static bool eval_shexpr(struct hook_ctx *ctx,
+                        const struct shexp expr[MAX_SHEXP_SIZE],
+                        float *result)
+{
+    struct hook_priv *p = ctx->priv;
+    float stack[MAX_SHEXP_SIZE] = {0};
+    int idx = 0; // points to next element to push
+
+    for (int i = 0; i < MAX_SHEXP_SIZE; i++) {
+        switch (expr[i].tag) {
+        case SHEXP_END:
+            goto done;
+
+        case SHEXP_CONST:
+            // Since our SHEXPs are bound by MAX_SHEXP_SIZE, it should be
+            // impossible to overflow the stack
+            assert(idx < MAX_SHEXP_SIZE);
+            stack[idx++] = expr[i].val.cval;
+            continue;
+
+        case SHEXP_OP1:
+            if (idx < 1) {
+                PL_WARN(p, "Stack underflow in RPN expression!");
+                return false;
+            }
+
+            switch (expr[i].val.op) {
+            case SHEXP_OP_NOT: stack[idx-1] = !stack[idx-1]; break;
+            default: pl_unreachable();
+            }
+            continue;
+
+        case SHEXP_OP2: {
+            if (idx < 2) {
+                PL_WARN(p, "Stack underflow in RPN expression!");
+                return false;
+            }
+
+            // Pop the operands in reverse order
+            float op2 = stack[--idx];
+            float op1 = stack[--idx];
+            float res = 0.0;
+            switch (expr[i].val.op) {
+            case SHEXP_OP_ADD: res = op1 + op2; break;
+            case SHEXP_OP_SUB: res = op1 - op2; break;
+            case SHEXP_OP_MUL: res = op1 * op2; break;
+            case SHEXP_OP_DIV: res = op1 / op2; break;
+            case SHEXP_OP_MOD: res = fmodf(op1, op2); break;
+            case SHEXP_OP_GT:  res = op1 > op2; break;
+            case SHEXP_OP_LT:  res = op1 < op2; break;
+            case SHEXP_OP_EQ:
+                // Relative tolerance, scaled by the larger magnitude. Using
+                // the magnitudes (rather than the raw values) keeps the
+                // tolerance non-negative, so equal negative operands compare
+                // as equal.
+                res = fabsf(op1 - op2) <= 1e-6 * fmaxf(fabsf(op1), fabsf(op2));
+                break;
+            case SHEXP_OP_NOT: pl_unreachable();
+            }
+
+            // Catches e.g. division by zero
+            if (!isfinite(res)) {
+                PL_WARN(p, "Illegal operation in RPN expression!");
+                return false;
+            }
+
+            stack[idx++] = res;
+            continue;
+        }
+
+        case SHEXP_TEX_W:
+        case SHEXP_TEX_H: {
+            pl_str name = expr[i].val.varname;
+            float size[2];
+
+            if (!lookup_tex(ctx, name, size)) {
+                PL_WARN(p, "Variable '%.*s' not found in RPN expression!",
+                        PL_STR_FMT(name));
+                return false;
+            }
+
+            stack[idx++] = (expr[i].tag == SHEXP_TEX_W) ? size[0] : size[1];
+            continue;
+        }
+
+        case SHEXP_VAR: {
+            pl_str name = expr[i].val.varname;
+            float val;
+            if (!lookup_var(ctx, name, &val))
+                return false;
+            stack[idx++] = val;
+            continue;
+        }
+        }
+    }
+
+done:
+    // Return the single stack element
+    if (idx != 1) {
+        PL_WARN(p, "Malformed stack after RPN expression!");
+        return false;
+    }
+
+    *result = stack[0];
+    return true;
+}
+
+// Advances the 256-bit PRNG state `s` by one step and returns a double
+// derived from the pre-update state.
+// NOTE(review): the update sequence resembles xoshiro256+ — confirm against
+// the reference implementation before relying on its statistical properties.
+static double prng_step(uint64_t s[4])
+{
+ // The output is computed from the state *before* it is updated
+ const uint64_t result = s[0] + s[3];
+ const uint64_t t = s[1] << 17;
+
+ s[2] ^= s[0];
+ s[3] ^= s[1];
+ s[1] ^= s[2];
+ s[0] ^= s[3];
+
+ s[2] ^= t;
+ // Rotate s[3] left by 45 bits
+ s[3] = (s[3] << 45) | (s[3] >> (64 - 45));
+ // Keep the top 53 bits and scale by 2^-53, giving a value in [0, 1)
+ return (result >> 11) * 0x1.0p-53;
+}
+
+// Binds the texture of `ptex` to the shader `sh` and emits a family of
+// `<name>_*` preprocessor defines (raw, pos, map, size, pt, off, mul, rot,
+// tex, texOff and, if the format is gatherable, gather) into the shader
+// header. If `hooked` is set, matching `HOOKED_*` aliases are emitted as
+// well; if `mainpresub` is set, `MAIN_*` aliases pointing at `MAINPRESUB_*`
+// are emitted. Returns false if binding the texture failed.
+static bool bind_pass_tex(pl_shader sh, pl_str name,
+ const struct pass_tex *ptex,
+ const pl_rect2df *rect,
+ bool hooked, bool mainpresub)
+{
+ ident_t id, pos, pt;
+
+ // Compatibility with mpv texture binding semantics
+ id = sh_bind(sh, ptex->tex, PL_TEX_ADDRESS_CLAMP, PL_TEX_SAMPLE_LINEAR,
+ "hook_tex", rect, &pos, &pt);
+ if (!id)
+ return false;
+
+ GLSLH("#define %.*s_raw "$" \n", PL_STR_FMT(name), id);
+ GLSLH("#define %.*s_pos "$" \n", PL_STR_FMT(name), pos);
+ GLSLH("#define %.*s_map "$"_map \n", PL_STR_FMT(name), pos);
+ GLSLH("#define %.*s_size vec2(textureSize("$", 0)) \n", PL_STR_FMT(name), id);
+ GLSLH("#define %.*s_pt "$" \n", PL_STR_FMT(name), pt);
+
+ // The offset exposed to the shader is the top-left corner of ptex->rect
+ float off[2] = { ptex->rect.x0, ptex->rect.y0 };
+ GLSLH("#define %.*s_off "$" \n", PL_STR_FMT(name),
+ sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_vec2("offset"),
+ .data = off,
+ }));
+
+ // Normalize a local copy of the representation so `ptex` is not modified;
+ // the returned factor becomes the `_mul` multiplier
+ struct pl_color_repr repr = ptex->repr;
+ ident_t scale = SH_FLOAT(pl_color_repr_normalize(&repr));
+ GLSLH("#define %.*s_mul "$" \n", PL_STR_FMT(name), scale);
+
+ // Compatibility with mpv
+ GLSLH("#define %.*s_rot mat2(1.0, 0.0, 0.0, 1.0) \n", PL_STR_FMT(name));
+
+ // Sampling function boilerplate
+ GLSLH("#define %.*s_tex(pos) ("$" * vec4(textureLod("$", pos, 0.0))) \n",
+ PL_STR_FMT(name), scale, id);
+ GLSLH("#define %.*s_texOff(off) (%.*s_tex("$" + "$" * vec2(off))) \n",
+ PL_STR_FMT(name), PL_STR_FMT(name), pos, pt);
+
+ // The gather helper is only defined for formats that support it
+ bool can_gather = ptex->tex->params.format->gatherable;
+ if (can_gather) {
+ GLSLH("#define %.*s_gather(pos, c) ("$" * vec4(textureGather("$", pos, c))) \n",
+ PL_STR_FMT(name), scale, id);
+ }
+
+ if (hooked) {
+ GLSLH("#define HOOKED_raw %.*s_raw \n", PL_STR_FMT(name));
+ GLSLH("#define HOOKED_pos %.*s_pos \n", PL_STR_FMT(name));
+ GLSLH("#define HOOKED_size %.*s_size \n", PL_STR_FMT(name));
+ GLSLH("#define HOOKED_rot %.*s_rot \n", PL_STR_FMT(name));
+ GLSLH("#define HOOKED_off %.*s_off \n", PL_STR_FMT(name));
+ GLSLH("#define HOOKED_pt %.*s_pt \n", PL_STR_FMT(name));
+ GLSLH("#define HOOKED_map %.*s_map \n", PL_STR_FMT(name));
+ GLSLH("#define HOOKED_mul %.*s_mul \n", PL_STR_FMT(name));
+ GLSLH("#define HOOKED_tex %.*s_tex \n", PL_STR_FMT(name));
+ GLSLH("#define HOOKED_texOff %.*s_texOff \n", PL_STR_FMT(name));
+ if (can_gather)
+ GLSLH("#define HOOKED_gather %.*s_gather \n", PL_STR_FMT(name));
+ }
+
+ if (mainpresub) {
+ GLSLH("#define MAIN_raw MAINPRESUB_raw \n");
+ GLSLH("#define MAIN_pos MAINPRESUB_pos \n");
+ GLSLH("#define MAIN_size MAINPRESUB_size \n");
+ GLSLH("#define MAIN_rot MAINPRESUB_rot \n");
+ GLSLH("#define MAIN_off MAINPRESUB_off \n");
+ GLSLH("#define MAIN_pt MAINPRESUB_pt \n");
+ GLSLH("#define MAIN_map MAINPRESUB_map \n");
+ GLSLH("#define MAIN_mul MAINPRESUB_mul \n");
+ GLSLH("#define MAIN_tex MAINPRESUB_tex \n");
+ GLSLH("#define MAIN_texOff MAINPRESUB_texOff \n");
+ if (can_gather)
+ GLSLH("#define MAIN_gather MAINPRESUB_gather \n");
+ }
+
+ return true;
+}
+
+// Stores `ptex` in the list of saved pass textures. An existing entry with
+// the same name is overwritten in place; otherwise a new entry is appended.
+static void save_pass_tex(struct hook_priv *p, struct pass_tex ptex)
+{
+    for (int n = 0; n < p->pass_textures.num; n++) {
+        struct pass_tex *slot = &p->pass_textures.elem[n];
+        if (pl_str_equals(slot->name, ptex.name)) {
+            *slot = ptex;
+            return;
+        }
+    }
+
+    // No texture with this name yet, append new one
+    PL_ARRAY_APPEND(p->alloc, p->pass_textures, ptex);
+}
+
+// Hook callback: runs every registered pass whose exec_stages include
+// `params->stage`, in registration order. Each pass is skipped if its
+// condition evaluates to zero or if it binds an unknown texture name.
+// The output of each executed pass is saved under its SAVE name (or the stage
+// name if none); only outputs saved under the stage name update the returned
+// result. Returns a zero-initialized result if nothing replaced the hooked
+// texture, or `.failed = true` on error.
+static struct pl_hook_res hook_hook(void *priv, const struct pl_hook_params *params)
+{
+ struct hook_priv *p = priv;
+ pl_str stage = pl_stage_to_mp(params->stage);
+ struct pl_hook_res res = {0};
+
+ pl_shader sh = NULL;
+ struct hook_ctx ctx = {
+ .priv = p,
+ .params = params,
+ .hooked = {
+ .name = stage,
+ .tex = params->tex,
+ .rect = params->rect,
+ .repr = params->repr,
+ .color = params->color,
+ .comps = params->components,
+ },
+ };
+
+ // Save the input texture if needed
+ if (p->save_stages & params->stage) {
+ PL_TRACE(p, "Saving input texture '%.*s' for binding",
+ PL_STR_FMT(ctx.hooked.name));
+ save_pass_tex(p, ctx.hooked);
+ }
+
+ for (int n = 0; n < p->hook_passes.num; n++) {
+ const struct hook_pass *pass = &p->hook_passes.elem[n];
+ if (!(pass->exec_stages & params->stage))
+ continue;
+
+ const struct custom_shader_hook *hook = &pass->hook;
+ PL_TRACE(p, "Executing hook pass %d on stage '%.*s': %.*s",
+ n, PL_STR_FMT(stage), PL_STR_FMT(hook->pass_desc));
+
+ // Test for execution condition
+ float run = 0;
+ if (!eval_shexpr(&ctx, hook->cond, &run))
+ goto error;
+
+ if (!run) {
+ PL_TRACE(p, "Skipping hook due to condition");
+ continue;
+ }
+
+ // Generate a new shader object
+ sh = pl_dispatch_begin(params->dispatch);
+
+ // Bind all necessary input textures
+ for (int i = 0; i < PL_ARRAY_SIZE(hook->bind_tex); i++) {
+ pl_str texname = hook->bind_tex[i];
+ // The first empty name terminates the bind list
+ if (!texname.len)
+ break;
+
+ // Convenience alias, to allow writing shaders that are oblivious
+ // of the exact stage they hooked. This simply translates to
+ // whatever stage actually fired the hook.
+ bool hooked = false, mainpresub = false;
+ if (pl_str_equals0(texname, "HOOKED")) {
+ // Continue with binding this, under the new name
+ texname = stage;
+ hooked = true;
+ }
+
+ // Compatibility alias, because MAIN and MAINPRESUB mean the same
+ // thing to libplacebo, but user shaders are still written as
+ // though they can be different concepts.
+ if (pl_str_equals0(texname, "MAIN") ||
+ pl_str_equals0(texname, "MAINPRESUB"))
+ {
+ texname = pl_str0("MAINPRESUB");
+ mainpresub = true;
+ }
+
+ // Shader-local resources (TEXTURE/BUFFER sections) are searched
+ // before the saved pass textures
+ for (int j = 0; j < p->descriptors.num; j++) {
+ if (pl_str_equals0(texname, p->descriptors.elem[j].desc.name)) {
+ // Directly bind this, no need to bother with all the
+ // `bind_pass_tex` boilerplate
+ ident_t id = sh_desc(sh, p->descriptors.elem[j]);
+ GLSLH("#define %.*s "$" \n", PL_STR_FMT(texname), id);
+
+ if (p->descriptors.elem[j].desc.type == PL_DESC_SAMPLED_TEX) {
+ GLSLH("#define %.*s_tex(pos) (textureLod("$", pos, 0.0)) \n",
+ PL_STR_FMT(texname), id);
+ }
+ goto next_bind;
+ }
+ }
+
+ for (int j = 0; j < p->pass_textures.num; j++) {
+ if (pl_str_equals(texname, p->pass_textures.elem[j].name)) {
+ // Note: We bind the whole texture, rather than
+ // hooked.rect, because user shaders in general are not
+ // designed to handle cropped input textures.
+ const struct pass_tex *ptex = &p->pass_textures.elem[j];
+ pl_rect2df rect = {
+ 0, 0, ptex->tex->params.w, ptex->tex->params.h,
+ };
+
+ if (hook->offset_align && pl_str_equals(texname, stage)) {
+ float sx = pl_rect_w(ctx.hooked.rect) / pl_rect_w(params->src_rect),
+ sy = pl_rect_h(ctx.hooked.rect) / pl_rect_h(params->src_rect),
+ ox = ctx.hooked.rect.x0 - sx * params->src_rect.x0,
+ oy = ctx.hooked.rect.y0 - sy * params->src_rect.y0;
+
+ PL_TRACE(p, "Aligning plane with ref: %f %f", ox, oy);
+ pl_rect2df_offset(&rect, ox, oy);
+ }
+
+ if (!bind_pass_tex(sh, texname, &p->pass_textures.elem[j],
+ &rect, hooked, mainpresub))
+ {
+ goto error;
+ }
+ goto next_bind;
+ }
+ }
+
+ // If none of the above matched, this is an unknown texture name,
+ // so silently ignore this pass to match the mpv behavior
+ PL_TRACE(p, "Skipping hook due to no texture named '%.*s'.",
+ PL_STR_FMT(texname));
+ pl_dispatch_abort(params->dispatch, &sh);
+ goto next_pass;
+
+ next_bind: ; // outer 'continue'
+ }
+
+ // Set up the input variables
+ // Note: the frame counter advances once per executed pass
+ p->frame_count++;
+ GLSLH("#define frame "$" \n", sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_int("frame"),
+ .data = &p->frame_count,
+ .dynamic = true,
+ }));
+
+ float random = prng_step(p->prng_state);
+ GLSLH("#define random "$" \n", sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_float("random"),
+ .data = &random,
+ .dynamic = true,
+ }));
+
+ float src_size[2] = { pl_rect_w(params->src_rect), pl_rect_h(params->src_rect) };
+ GLSLH("#define input_size "$" \n", sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_vec2("input_size"),
+ .data = src_size,
+ }));
+
+ float dst_size[2] = { pl_rect_w(params->dst_rect), pl_rect_h(params->dst_rect) };
+ GLSLH("#define target_size "$" \n", sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_vec2("target_size"),
+ .data = dst_size,
+ }));
+
+ float tex_off[2] = { params->src_rect.x0, params->src_rect.y0 };
+ GLSLH("#define tex_offset "$" \n", sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_vec2("tex_offset"),
+ .data = tex_off,
+ }));
+
+ // Custom parameters
+ for (int i = 0; i < p->hook_params.num; i++) {
+ const struct pl_hook_par *hp = &p->hook_params.elem[i];
+ switch (hp->mode) {
+ case PL_HOOK_PAR_VARIABLE:
+ case PL_HOOK_PAR_DYNAMIC:
+ GLSLH("#define %s "$" \n", hp->name,
+ sh_var(sh, (struct pl_shader_var) {
+ .var = {
+ .name = hp->name,
+ .type = hp->type,
+ .dim_v = 1,
+ .dim_m = 1,
+ .dim_a = 1,
+ },
+ .data = hp->data,
+ .dynamic = hp->mode == PL_HOOK_PAR_DYNAMIC,
+ }));
+ break;
+
+ case PL_HOOK_PAR_CONSTANT:
+ GLSLH("#define %s "$" \n", hp->name,
+ sh_const(sh, (struct pl_shader_const) {
+ .name = hp->name,
+ .type = hp->type,
+ .data = hp->data,
+ .compile_time = true,
+ }));
+ break;
+
+ case PL_HOOK_PAR_DEFINE:
+ GLSLH("#define %s %d \n", hp->name, hp->data->i);
+ break;
+
+ case PL_HOOK_PAR_MODE_COUNT:
+ pl_unreachable();
+ }
+
+ // Enum parameters additionally expose each name as its index
+ if (hp->names) {
+ for (int j = hp->minimum.i; j <= hp->maximum.i; j++)
+ GLSLH("#define %s %d \n", hp->names[j], j);
+ }
+ }
+
+ // Helper sub-shaders
+ // Each helper gets its own ID, derived from the main shader's ID
+ uint64_t sh_id = SH_PARAMS(sh).id;
+ pl_shader_reset(p->trc_helper, pl_shader_params(
+ .id = ++sh_id,
+ .gpu = p->gpu,
+ ));
+ pl_shader_linearize(p->trc_helper, params->orig_color);
+ GLSLH("#define linearize "$" \n", sh_subpass(sh, p->trc_helper));
+
+ pl_shader_reset(p->trc_helper, pl_shader_params(
+ .id = ++sh_id,
+ .gpu = p->gpu,
+ ));
+ pl_shader_delinearize(p->trc_helper, params->orig_color);
+ GLSLH("#define delinearize "$" \n", sh_subpass(sh, p->trc_helper));
+
+ // Load and run the user shader itself
+ sh_append_str(sh, SH_BUF_HEADER, hook->pass_body);
+ sh_describef(sh, "%.*s", PL_STR_FMT(hook->pass_desc));
+
+ // Resolve output size and create framebuffer
+ float out_size[2] = {0};
+ if (!eval_shexpr(&ctx, hook->width, &out_size[0]) ||
+ !eval_shexpr(&ctx, hook->height, &out_size[1]))
+ {
+ goto error;
+ }
+
+ int out_w = roundf(out_size[0]),
+ out_h = roundf(out_size[1]);
+
+ if (!sh_require(sh, PL_SHADER_SIG_NONE, out_w, out_h))
+ goto error;
+
+ // Generate a new texture to store the render result
+ pl_tex fbo;
+ fbo = params->get_tex(params->priv, out_w, out_h);
+ if (!fbo) {
+ PL_ERR(p, "Failed dispatching hook: `get_tex` callback failed?");
+ goto error;
+ }
+
+ bool ok;
+ if (hook->is_compute) {
+
+ // Compute passes write directly to the FBO, so it must be storable
+ if (!sh_try_compute(sh, hook->threads_w, hook->threads_h, false, 0) ||
+ !fbo->params.storable)
+ {
+ PL_ERR(p, "Failed dispatching COMPUTE shader");
+ goto error;
+ }
+
+ GLSLP("#define out_image "$" \n", sh_desc(sh, (struct pl_shader_desc) {
+ .binding.object = fbo,
+ .desc = {
+ .name = "out_image",
+ .type = PL_DESC_STORAGE_IMG,
+ .access = PL_DESC_ACCESS_WRITEONLY,
+ },
+ }));
+
+ sh->output = PL_SHADER_SIG_NONE;
+
+ GLSL("hook(); \n");
+ ok = pl_dispatch_compute(params->dispatch, pl_dispatch_compute_params(
+ .shader = &sh,
+ .dispatch_size = {
+ // Round up as many blocks as are needed to cover the image
+ PL_DIV_UP(out_w, hook->block_w),
+ PL_DIV_UP(out_h, hook->block_h),
+ 1,
+ },
+ .width = out_w,
+ .height = out_h,
+ ));
+
+ } else {
+
+ // Default non-COMPUTE shaders to explicitly use fragment shaders
+ // only, to avoid breaking things like fwidth()
+ sh->type = PL_DEF(sh->type, SH_FRAGMENT);
+
+ GLSL("vec4 color = hook(); \n");
+ ok = pl_dispatch_finish(params->dispatch, pl_dispatch_params(
+ .shader = &sh,
+ .target = fbo,
+ ));
+
+ }
+
+ if (!ok)
+ goto error;
+
+ // Scale the hooked rect by the output/input size ratio and apply the
+ // pass's configured offset to get the rect of the new texture
+ float sx = (float) out_w / ctx.hooked.tex->params.w,
+ sy = (float) out_h / ctx.hooked.tex->params.h,
+ x0 = sx * ctx.hooked.rect.x0 + hook->offset[0],
+ y0 = sy * ctx.hooked.rect.y0 + hook->offset[1];
+
+ pl_rect2df new_rect = {
+ x0,
+ y0,
+ x0 + sx * pl_rect_w(ctx.hooked.rect),
+ y0 + sy * pl_rect_h(ctx.hooked.rect),
+ };
+
+ if (hook->offset_align) {
+ float rx = pl_rect_w(new_rect) / pl_rect_w(params->src_rect),
+ ry = pl_rect_h(new_rect) / pl_rect_h(params->src_rect),
+ ox = rx * params->src_rect.x0 - sx * ctx.hooked.rect.x0,
+ oy = ry * params->src_rect.y0 - sy * ctx.hooked.rect.y0;
+
+ pl_rect2df_offset(&new_rect, ox, oy);
+ }
+
+ // Save the result of this shader invocation
+ struct pass_tex ptex = {
+ .name = hook->save_tex.len ? hook->save_tex : stage,
+ .tex = fbo,
+ .repr = ctx.hooked.repr,
+ .color = ctx.hooked.color,
+ .comps = PL_DEF(hook->comps, ctx.hooked.comps),
+ .rect = new_rect,
+ };
+
+ // It's assumed that users will correctly normalize the input
+ pl_color_repr_normalize(&ptex.repr);
+
+ PL_TRACE(p, "Saving output texture '%.*s' from hook execution on '%.*s'",
+ PL_STR_FMT(ptex.name), PL_STR_FMT(stage));
+
+ save_pass_tex(p, ptex);
+
+ // Update the result object, unless we saved to a different name
+ if (pl_str_equals(ptex.name, stage)) {
+ // Later passes on this stage see this output as the hooked texture
+ ctx.hooked = ptex;
+ res = (struct pl_hook_res) {
+ .output = PL_HOOK_SIG_TEX,
+ .tex = fbo,
+ .repr = ptex.repr,
+ .color = ptex.color,
+ .components = ptex.comps,
+ .rect = new_rect,
+ };
+ }
+
+next_pass: ;
+ }
+
+ return res;
+
+error:
+ // NOTE(review): `sh` may be NULL here (e.g. when a condition fails before
+ // any shader was begun) — presumably pl_dispatch_abort tolerates that
+ pl_dispatch_abort(params->dispatch, &sh);
+ return (struct pl_hook_res) { .failed = true };
+}
+
+// Parses an mpv-style user shader of `shader_len` bytes into a new hook
+// object. The text is split into `//!TEXTURE`, `//!BUFFER` and `//!PARAM`
+// sections plus hook passes; everything before the first `//!` is ignored.
+// Returns NULL if `shader_len` is zero or parsing fails (the partially built
+// hook is destroyed in that case). A returned hook is released with
+// pl_mpv_user_shader_destroy().
+const struct pl_hook *pl_mpv_user_shader_parse(pl_gpu gpu,
+ const char *shader_text,
+ size_t shader_len)
+{
+ if (!shader_len)
+ return NULL;
+
+ pl_str shader = { (uint8_t *) shader_text, shader_len };
+
+ struct pl_hook *hook = pl_zalloc_obj(NULL, hook, struct hook_priv);
+ struct hook_priv *p = PL_PRIV(hook);
+
+ *hook = (struct pl_hook) {
+ .input = PL_HOOK_SIG_TEX,
+ .priv = p,
+ .reset = hook_reset,
+ .hook = hook_hook,
+ .signature = pl_str_hash(shader),
+ };
+
+ // The hook itself is the allocation parent of all parsed data
+ *p = (struct hook_priv) {
+ .log = gpu->log,
+ .gpu = gpu,
+ .alloc = hook,
+ .trc_helper = pl_shader_alloc(gpu->log, NULL),
+ .prng_state = {
+ // Determined by fair die roll
+ 0xb76d71f9443c228allu, 0x93a02092fc4807e8llu,
+ 0x06d81748f838bd07llu, 0x9381ee129dddce6cllu,
+ },
+ };
+
+ // Work on a private copy, so parsed strings do not point into the
+ // caller's buffer
+ shader = pl_strdup(hook, shader);
+
+ // Skip all garbage (e.g. comments) before the first header
+ int pos = pl_str_find(shader, pl_str0("//!"));
+ if (pos < 0) {
+ PL_ERR(gpu, "Shader appears to contain no headers?");
+ goto error;
+ }
+ shader = pl_str_drop(shader, pos);
+
+ // Loop over the file
+ while (shader.len > 0)
+ {
+ // Peek at the first header to dispatch the right type
+ if (pl_str_startswith0(shader, "//!TEXTURE")) {
+ struct pl_shader_desc sd;
+ if (!parse_tex(gpu, hook, &shader, &sd))
+ goto error;
+
+ PL_INFO(gpu, "Registering named texture '%s'", sd.desc.name);
+ PL_ARRAY_APPEND(hook, p->descriptors, sd);
+ continue;
+ }
+
+ if (pl_str_startswith0(shader, "//!BUFFER")) {
+ struct pl_shader_desc sd;
+ if (!parse_buf(gpu, hook, &shader, &sd))
+ goto error;
+
+ PL_INFO(gpu, "Registering named buffer '%s'", sd.desc.name);
+ PL_ARRAY_APPEND(hook, p->descriptors, sd);
+ continue;
+ }
+
+ if (pl_str_startswith0(shader, "//!PARAM")) {
+ struct pl_hook_par hp;
+ if (!parse_param(gpu->log, hook, &shader, &hp))
+ goto error;
+
+ PL_INFO(gpu, "Registering named parameter '%s'", hp.name);
+ PL_ARRAY_APPEND(hook, p->hook_params, hp);
+ continue;
+ }
+
+ // Anything else is treated as a hook pass
+ struct custom_shader_hook h;
+ if (!parse_hook(gpu->log, &shader, &h))
+ goto error;
+
+ struct hook_pass pass = {
+ .exec_stages = 0,
+ .hook = h,
+ };
+
+ // Hooked stages trigger execution; bound stages must have their input
+ // saved. Binding HOOKED means saving every stage this pass runs on.
+ for (int i = 0; i < PL_ARRAY_SIZE(h.hook_tex); i++)
+ pass.exec_stages |= mp_stage_to_pl(h.hook_tex[i]);
+ for (int i = 0; i < PL_ARRAY_SIZE(h.bind_tex); i++) {
+ p->save_stages |= mp_stage_to_pl(h.bind_tex[i]);
+ if (pl_str_equals0(h.bind_tex[i], "HOOKED"))
+ p->save_stages |= pass.exec_stages;
+ }
+
+ // As an extra precaution, this avoids errors when trying to run
+ // conditions against planes that were never hooked. As a sole
+ // exception, OUTPUT is special because it's hard-coded to return the
+ // dst_rect even before it was hooked. (This is an apparently
+ // undocumented mpv quirk, but shaders rely on it in practice)
+ enum pl_hook_stage rpn_stages = 0;
+ for (int i = 0; i < PL_ARRAY_SIZE(h.width); i++) {
+ if (h.width[i].tag == SHEXP_TEX_W || h.width[i].tag == SHEXP_TEX_H)
+ rpn_stages |= mp_stage_to_pl(h.width[i].val.varname);
+ }
+ for (int i = 0; i < PL_ARRAY_SIZE(h.height); i++) {
+ if (h.height[i].tag == SHEXP_TEX_W || h.height[i].tag == SHEXP_TEX_H)
+ rpn_stages |= mp_stage_to_pl(h.height[i].val.varname);
+ }
+ for (int i = 0; i < PL_ARRAY_SIZE(h.cond); i++) {
+ if (h.cond[i].tag == SHEXP_TEX_W || h.cond[i].tag == SHEXP_TEX_H)
+ rpn_stages |= mp_stage_to_pl(h.cond[i].val.varname);
+ }
+
+ p->save_stages |= rpn_stages & ~PL_HOOK_OUTPUT;
+
+ PL_INFO(gpu, "Registering hook pass: %.*s", PL_STR_FMT(h.pass_desc));
+ PL_ARRAY_APPEND(hook, p->hook_passes, pass);
+ }
+
+ // We need to hook on both the exec and save stages, so that we can keep
+ // track of any textures we might need
+ hook->stages |= p->save_stages;
+ for (int i = 0; i < p->hook_passes.num; i++)
+ hook->stages |= p->hook_passes.elem[i].exec_stages;
+
+ hook->parameters = p->hook_params.elem;
+ hook->num_parameters = p->hook_params.num;
+
+ // NOTE(review): `shader_text` is logged without `shader_len` here and in
+ // the error path — presumably pl_msg_source expects a NUL-terminated
+ // string; confirm that callers guarantee this
+ PL_MSG(gpu, PL_LOG_DEBUG, "Loaded user shader:");
+ pl_msg_source(gpu->log, PL_LOG_DEBUG, shader_text);
+
+ return hook;
+
+error:
+ pl_mpv_user_shader_destroy((const struct pl_hook **) &hook);
+ PL_MSG(gpu, PL_LOG_ERR, "Failed to parse user shader:");
+ pl_msg_source(gpu->log, PL_LOG_ERR, shader_text);
+ pl_log_stack_trace(gpu->log, PL_LOG_ERR);
+ return NULL;
+}
+
+// Destroys a hook created by pl_mpv_user_shader_parse() and sets `*hookp` to
+// NULL. Every buffer and texture registered as a shader-local descriptor is
+// destroyed, followed by the helper shader and the hook allocation itself.
+// Does nothing if `*hookp` is already NULL.
+void pl_mpv_user_shader_destroy(const struct pl_hook **hookp)
+{
+    const struct pl_hook *hook = *hookp;
+    if (!hook)
+        return;
+
+    struct hook_priv *p = PL_PRIV(hook);
+    for (int i = 0; i < p->descriptors.num; i++) {
+        switch (p->descriptors.elem[i].desc.type) {
+        case PL_DESC_BUF_UNIFORM:
+        case PL_DESC_BUF_STORAGE:
+        case PL_DESC_BUF_TEXEL_UNIFORM:
+        case PL_DESC_BUF_TEXEL_STORAGE: {
+            pl_buf buf = p->descriptors.elem[i].binding.object;
+            pl_buf_destroy(p->gpu, &buf);
+            break;
+        }
+
+        case PL_DESC_SAMPLED_TEX:
+        case PL_DESC_STORAGE_IMG: {
+            pl_tex tex = p->descriptors.elem[i].binding.object;
+            pl_tex_destroy(p->gpu, &tex);
+            break;
+        }
+
+        // Kept outside the texture case's block, so that reaching these
+        // labels never jumps into the scope of `tex`
+        case PL_DESC_INVALID:
+        case PL_DESC_TYPE_COUNT:
+            pl_unreachable();
+        }
+    }
+
+    pl_shader_free(&p->trc_helper);
+    pl_free((void *) hook);
+    *hookp = NULL;
+}
diff --git a/src/shaders/deinterlacing.c b/src/shaders/deinterlacing.c
new file mode 100644
index 0000000..5c85138
--- /dev/null
+++ b/src/shaders/deinterlacing.c
@@ -0,0 +1,260 @@
+/*
+ * This file is part of libplacebo, but also based on vf_yadif_cuda.cu:
+ * Copyright (C) 2018 Philip Langdale <philipl@overt.org>
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "shaders.h"
+
+#include <libplacebo/shaders/deinterlacing.h>
+
+const struct pl_deinterlace_params pl_deinterlace_default_params = { PL_DEINTERLACE_DEFAULTS }; // handed to PL_DEF() by pl_shader_deinterlace() as the fallback for `params`
+/*
+ * Emits GLSL into `sh` that samples `src->cur.top` and stores the deinterlaced
+ * result in a `vec4 color` variable.
+ *
+ * sh:     shader to append to; sh_require() is called with PL_SHADER_SIG_NONE
+ *         and the width/height of the current frame's texture.
+ * src:    the current frame, optional previous/next frames, the field to keep
+ *         (`field`), the field order (`first_field`) and a `component_mask`.
+ * params: algorithm selection; `pl_deinterlace_default_params` is passed to
+ *         PL_DEF() as the fallback.
+ *
+ * With PL_FIELD_NONE the current frame is copied unchanged. Otherwise lines
+ * whose parity matches `src->field` are copied as-is and all other lines are
+ * produced by the selected algorithm (weave, bob or yadif).
+ *
+ * The function returns early, leaving the emitted GLSL unterminated, when
+ * sh_require() fails, when the component mask is empty (reported through
+ * SH_FAIL) or when one of the sh_bind() calls returns no identifier.
+ */
+void pl_shader_deinterlace(pl_shader sh, const struct pl_deinterlace_source *src,
+ const struct pl_deinterlace_params *params)
+{
+ params = PL_DEF(params, &pl_deinterlace_default_params);
+
+ const struct pl_tex_params *texparams = &src->cur.top->params; // the current frame's texture defines the output size
+ if (!sh_require(sh, PL_SHADER_SIG_NONE, texparams->w, texparams->h))
+ return;
+
+ sh_describe(sh, "deinterlacing");
+ GLSL("vec4 color = vec4(0,0,0,1); \n"
+ "// pl_shader_deinterlace \n"
+ "{ \n");
+
+ uint8_t comp_mask = PL_DEF(src->component_mask, 0xFu);
+ comp_mask &= (1u << texparams->format->num_components) - 1u; // keep only bits for components the texture format has
+ if (!comp_mask) {
+ SH_FAIL(sh, "pl_shader_deinterlace: empty component mask?");
+ return;
+ }
+
+ const uint8_t num_comps = sh_num_comps(comp_mask);
+ const char *swiz = sh_swizzle(comp_mask);
+ GLSL("#define T %s \n", sh_float_type(comp_mask)); // T: GLSL type used for the selected components
+
+ ident_t pos, pt; // filled in by sh_bind(); used below as sampling position and per-texel step
+ ident_t cur = sh_bind(sh, src->cur.top, PL_TEX_ADDRESS_MIRROR,
+ PL_TEX_SAMPLE_NEAREST, "cur", NULL, &pos, &pt);
+ if (!cur)
+ return;
+
+ GLSL("#define GET(TEX, X, Y) \\\n"
+ " (textureLod(TEX, pos + pt * vec2(X, Y), 0.0).%s) \n"
+ "vec2 pos = "$"; \n"
+ "vec2 pt = "$"; \n"
+ "T res; \n",
+ swiz, pos, pt);
+
+ if (src->field == PL_FIELD_NONE) {
+ GLSL("res = GET("$", 0, 0); \n", cur); // no field selected: copy the current frame unchanged
+ goto done;
+ }
+
+ // Don't modify the primary field
+ GLSL("int yh = textureSize("$", 0).y; \n"
+ "int yo = int("$".y * float(yh)); \n"
+ "if (yo %% 2 == %d) { \n"
+ " res = GET("$", 0, 0); \n"
+ "} else { \n",
+ cur, pos,
+ src->field == PL_FIELD_TOP ? 0 : 1, // line parity that is copied as-is
+ cur);
+
+ switch (params->algo) {
+ case PL_DEINTERLACE_WEAVE:
+ GLSL("res = GET("$", 0, 0); \n", cur); // take the line from the same texture without any filtering
+ break;
+
+ case PL_DEINTERLACE_BOB:
+ GLSL("res = GET("$", 0, %d); \n", cur, // duplicate a neighbouring line: y-1 for the top field, y+1 otherwise
+ src->field == PL_FIELD_TOP ? -1 : 1);
+ break;
+
+
+ case PL_DEINTERLACE_YADIF: {
+ // Try using a compute shader for this, for the sole reason of
+ // optimizing for thread group synchronicity. Otherwise, because we
+ // alternate between lines output as-is and lines output deinterlaced,
+ // half of our thread group will be mostly idle at any point in time.
+ const int bw = PL_DEF(sh_glsl(sh).subgroup_size, 32);
+ sh_try_compute(sh, bw, 1, true, 0); // result not checked; the code below is emitted either way
+
+ // This magic constant is hard-coded in the original implementation as
+ // '1' on an 8-bit scale. Since we work with arbitrary bit depth
+ // floating point textures, we have to convert this somehow. Hard-code
+ // it as 1/255 under the assumption that the original intent was to be
+ // roughly 1 unit of brightness increment on an 8-bit source. This may
+ // or may not produce suboptimal results on higher-bit-depth content.
+ static const float spatial_bias = 1 / 255.0f;
+
+ // Calculate spatial prediction
+ ident_t spatial_pred = sh_fresh(sh, "spatial_predictor");
+ GLSLH("float "$"(float a, float b, float c, float d, float e, float f, float g, \n"
+ " float h, float i, float j, float k, float l, float m, float n) \n"
+ "{ \n"
+ " float spatial_pred = (d + k) / 2.0; \n"
+ " float spatial_score = abs(c - j) + abs(d - k) + abs(e - l) - %f; \n"
+
+ " float score = abs(b - k) + abs(c - l) + abs(d - m); \n"
+ " if (score < spatial_score) { \n"
+ " spatial_pred = (c + l) / 2.0; \n"
+ " spatial_score = score; \n"
+ " score = abs(a - l) + abs(b - m) + abs(c - n); \n"
+ " if (score < spatial_score) { \n"
+ " spatial_pred = (b + m) / 2.0; \n"
+ " spatial_score = score; \n"
+ " } \n"
+ " } \n"
+ " score = abs(d - i) + abs(e - j) + abs(f - k); \n"
+ " if (score < spatial_score) { \n"
+ " spatial_pred = (e + j) / 2.0; \n"
+ " spatial_score = score; \n"
+ " score = abs(e - h) + abs(f - i) + abs(g - j); \n"
+ " if (score < spatial_score) { \n"
+ " spatial_pred = (f + i) / 2.0; \n"
+ " spatial_score = score; \n"
+ " } \n"
+ " } \n"
+ " return spatial_pred; \n"
+ "} \n",
+ spatial_pred, spatial_bias);
+
+ GLSL("T a = GET("$", -3, -1); \n" // a..g: seven samples from the line at y-1
+ "T b = GET("$", -2, -1); \n"
+ "T c = GET("$", -1, -1); \n"
+ "T d = GET("$", 0, -1); \n"
+ "T e = GET("$", +1, -1); \n"
+ "T f = GET("$", +2, -1); \n"
+ "T g = GET("$", +3, -1); \n"
+ "T h = GET("$", -3, +1); \n" // h..n: seven samples from the line at y+1
+ "T i = GET("$", -2, +1); \n"
+ "T j = GET("$", -1, +1); \n"
+ "T k = GET("$", 0, +1); \n"
+ "T l = GET("$", +1, +1); \n"
+ "T m = GET("$", +2, +1); \n"
+ "T n = GET("$", +3, +1); \n",
+ cur, cur, cur, cur, cur, cur, cur, cur, cur, cur, cur, cur, cur, cur);
+
+ if (num_comps == 1) { // scalar T: one call; otherwise one call per component
+ GLSL("res = "$"(a, b, c, d, e, f, g, h, i, j, k, l, m, n); \n", spatial_pred);
+ } else {
+ for (uint8_t i = 0; i < num_comps; i++) {
+ char c = "xyzw"[i];
+ GLSL("res.%c = "$"(a.%c, b.%c, c.%c, d.%c, e.%c, f.%c, g.%c, \n"
+ " h.%c, i.%c, j.%c, k.%c, l.%c, m.%c, n.%c); \n",
+ c, spatial_pred, c, c, c, c, c, c, c, c, c, c, c, c, c, c);
+ }
+ }
+
+ // Calculate temporal prediction
+ ident_t temporal_pred = sh_fresh(sh, "temporal_predictor");
+ GLSLH("float "$"(float A, float B, float C, float D, float E, float F, \n"
+ " float G, float H, float I, float J, float K, float L, \n"
+ " float spatial_pred) \n"
+ "{ \n"
+ " float p0 = (C + H) / 2.0; \n"
+ " float p1 = F; \n"
+ " float p2 = (D + I) / 2.0; \n"
+ " float p3 = G; \n"
+ " float p4 = (E + J) / 2.0; \n"
+
+ " float tdiff0 = abs(D - I) / 2.0; \n"
+ " float tdiff1 = (abs(A - F) + abs(B - G)) / 2.0; \n"
+ " float tdiff2 = (abs(K - F) + abs(G - L)) / 2.0; \n"
+ " float diff = max(tdiff0, max(tdiff1, tdiff2)); \n",
+ temporal_pred);
+ if (!params->skip_spatial_check) { // optional extra term widening `diff`
+ GLSLH("float maxi = max(p2 - min(p3, p1), min(p0 - p1, p4 - p3)); \n"
+ "float mini = min(p2 - max(p3, p1), max(p0 - p1, p4 - p3)); \n"
+ "diff = max(diff, max(mini, -maxi)); \n");
+ }
+ GLSLH(" if (spatial_pred > p2 + diff) \n" // clamp the spatial prediction to p2 +/- diff
+ " spatial_pred = p2 + diff; \n"
+ " if (spatial_pred < p2 - diff) \n"
+ " spatial_pred = p2 - diff; \n"
+ " return spatial_pred; \n"
+ "} \n");
+
+ ident_t prev2 = cur, next2 = cur; // the current frame stands in when no distinct prev/next texture is given
+ if (src->prev.top && src->prev.top != src->cur.top) {
+ pl_assert(src->prev.top->params.w == texparams->w);
+ pl_assert(src->prev.top->params.h == texparams->h);
+ prev2 = sh_bind(sh, src->prev.top, PL_TEX_ADDRESS_MIRROR,
+ PL_TEX_SAMPLE_NEAREST, "prev", NULL, NULL, NULL);
+ if (!prev2)
+ return;
+ }
+
+ if (src->next.top && src->next.top != src->cur.top) {
+ pl_assert(src->next.top->params.w == texparams->w);
+ pl_assert(src->next.top->params.h == texparams->h);
+ next2 = sh_bind(sh, src->next.top, PL_TEX_ADDRESS_MIRROR,
+ PL_TEX_SAMPLE_NEAREST, "next", NULL, NULL, NULL);
+ if (!next2)
+ return;
+ }
+
+ enum pl_field first_field = PL_DEF(src->first_field, PL_FIELD_TOP);
+ ident_t prev1 = src->field == first_field ? prev2 : cur; // texture choice depends on whether this field comes first in its frame
+ ident_t next1 = src->field == first_field ? cur : next2;
+
+ GLSL("T A = GET("$", 0, -1); \n"
+ "T B = GET("$", 0, 1); \n"
+ "T C = GET("$", 0, -2); \n"
+ "T D = GET("$", 0, 0); \n"
+ "T E = GET("$", 0, +2); \n"
+ "T F = GET("$", 0, -1); \n"
+ "T G = GET("$", 0, +1); \n"
+ "T H = GET("$", 0, -2); \n"
+ "T I = GET("$", 0, 0); \n"
+ "T J = GET("$", 0, +2); \n"
+ "T K = GET("$", 0, -1); \n"
+ "T L = GET("$", 0, +1); \n",
+ prev2, prev2, // A, B
+ prev1, prev1, prev1, // C, D, E
+ cur, cur, // F, G
+ next1, next1, next1, // H, I, J
+ next2, next2); // K, L
+
+ if (num_comps == 1) { // `res` still holds the spatial prediction at this point
+ GLSL("res = "$"(A, B, C, D, E, F, G, H, I, J, K, L, res); \n", temporal_pred);
+ } else {
+ for (uint8_t i = 0; i < num_comps; i++) {
+ char c = "xyzw"[i];
+ GLSL("res.%c = "$"(A.%c, B.%c, C.%c, D.%c, E.%c, F.%c, \n"
+ " G.%c, H.%c, I.%c, J.%c, K.%c, L.%c, \n"
+ " res.%c); \n",
+ c, temporal_pred, c, c, c, c, c, c, c, c, c, c, c, c, c);
+ }
+ }
+ break;
+ }
+
+ case PL_DEINTERLACE_ALGORITHM_COUNT:
+ pl_unreachable();
+ }
+
+ GLSL("}\n"); // End of primary/secondary field branch
+
+done:
+ GLSL("color.%s = res; \n" // write the selected components; the others keep the vec4(0,0,0,1) initial value
+ "#undef T \n"
+ "#undef GET \n"
+ "} \n",
+ swiz);
+}
diff --git a/src/shaders/dithering.c b/src/shaders/dithering.c
new file mode 100644
index 0000000..4485d11
--- /dev/null
+++ b/src/shaders/dithering.c
@@ -0,0 +1,527 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <math.h>
+#include "shaders.h"
+
+#include <libplacebo/shaders/dithering.h>
+
+const struct pl_dither_params pl_dither_default_params = { PL_DITHER_DEFAULTS }; // handed to PL_DEF() by pl_shader_dither() as the fallback for `params`
+
+struct sh_dither_obj { // state kept behind a PL_SHADER_OBJ_DITHER shader object
+ pl_shader_obj lut; // passed to sh_lut() as the object backing the dither matrix
+};
+
+// Teardown callback registered through SH_OBJ() for the dither state:
+// destroys the LUT object and leaves the state zero-initialized.
+static void sh_dither_uninit(pl_gpu gpu, void *ptr)
+{
+    const struct sh_dither_obj cleared = {0};
+    struct sh_dither_obj *state = ptr;
+
+    pl_shader_obj_destroy(&state->lut);
+    *state = cleared;
+}
+
+// LUT fill callback (installed as `.fill` by pl_shader_dither).
+//
+// `params->priv` carries the `pl_dither_params` in use. For the two LUT-based
+// methods a square matrix of side `params->width` is generated into `data`;
+// the remaining methods leave `data` untouched.
+static void fill_dither_matrix(void *data, const struct sh_lut_params *params)
+{
+    pl_assert(params->width > 0 && params->height > 0 && params->comps == 1);
+
+    const struct pl_dither_params *dpar = params->priv;
+    const enum pl_dither_method method = dpar->method;
+
+    if (method == PL_DITHER_ORDERED_LUT) {
+        pl_assert(params->width == params->height);
+        pl_generate_bayer_matrix(data, params->width);
+        return;
+    }
+
+    if (method == PL_DITHER_BLUE_NOISE) {
+        pl_assert(params->width == params->height);
+        pl_generate_blue_noise(data, params->width);
+        return;
+    }
+
+    // Methods that do not use a LUT have nothing to fill in
+    if (method == PL_DITHER_ORDERED_FIXED ||
+        method == PL_DITHER_WHITE_NOISE ||
+        method == PL_DITHER_METHOD_COUNT)
+    {
+        return;
+    }
+
+    pl_unreachable();
+}
+
+// Returns true for the dither methods that sample a generated matrix (blue
+// noise, ordered LUT) and false for the purely arithmetic ones.
+static bool dither_method_is_lut(enum pl_dither_method method)
+{
+    if (method == PL_DITHER_BLUE_NOISE || method == PL_DITHER_ORDERED_LUT)
+        return true;
+
+    if (method == PL_DITHER_ORDERED_FIXED || method == PL_DITHER_WHITE_NOISE)
+        return false;
+
+    // PL_DITHER_METHOD_COUNT and anything else is not a valid method
+    pl_unreachable();
+}
+
+// Returns a single exponent approximating the transfer function `trc`.
+// The HDR and log curves are all lumped together as 2.0.
+static inline float approx_gamma(enum pl_color_transfer trc)
+{
+    switch (trc) {
+    case PL_COLOR_TRC_UNKNOWN:
+    case PL_COLOR_TRC_LINEAR:
+        return 1.0f;
+
+    case PL_COLOR_TRC_PRO_PHOTO:
+    case PL_COLOR_TRC_GAMMA18:
+        return 1.8f;
+
+    case PL_COLOR_TRC_GAMMA20:
+        return 2.0f;
+
+    case PL_COLOR_TRC_SRGB:
+    case PL_COLOR_TRC_BT_1886:
+    case PL_COLOR_TRC_GAMMA22:
+        return 2.2f;
+
+    case PL_COLOR_TRC_GAMMA24:
+        return 2.4f;
+
+    case PL_COLOR_TRC_GAMMA26:
+    case PL_COLOR_TRC_ST428:
+        return 2.6f;
+
+    case PL_COLOR_TRC_GAMMA28:
+        return 2.8f;
+
+    case PL_COLOR_TRC_PQ:
+    case PL_COLOR_TRC_HLG:
+    case PL_COLOR_TRC_V_LOG:
+    case PL_COLOR_TRC_S_LOG1:
+    case PL_COLOR_TRC_S_LOG2:
+        return 2.0f; // TODO: handle this better
+
+    case PL_COLOR_TRC_COUNT:
+        break;
+    }
+
+    pl_unreachable();
+}
+
+// Appends GLSL that dithers the shader's `color` down to `new_depth` bits.
+//
+// sh:           shader to extend; sh_require() is called with
+//               PL_SHADER_SIG_COLOR.
+// new_depth:    target bit depth in the range [1, 256]. Any other value logs
+//               a warning and leaves the shader untouched.
+// dither_state: shader object that holds the dither LUT. May be NULL, in
+//               which case LUT-based methods fall back to
+//               PL_DITHER_ORDERED_FIXED (the same happens when creating the
+//               state object or the LUT fails).
+// params:       dither settings; `pl_dither_default_params` is the PL_DEF()
+//               fallback. A `lut_size` outside [0, 8] fails the shader.
+void pl_shader_dither(pl_shader sh, int new_depth,
+                      pl_shader_obj *dither_state,
+                      const struct pl_dither_params *params)
+{
+    if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+        return;
+
+    if (new_depth <= 0 || new_depth > 256) {
+        PL_WARN(sh, "Invalid dither depth: %d.. ignoring", new_depth);
+        return;
+    }
+
+    sh_describef(sh, "dithering (%d bits)", new_depth);
+    GLSL("// pl_shader_dither \n"
+         "{ \n"
+         "float bias; \n");
+
+    params = PL_DEF(params, &pl_dither_default_params);
+    if (params->lut_size < 0 || params->lut_size > 8) {
+        SH_FAIL(sh, "Invalid `lut_size` specified: %d", params->lut_size);
+        return;
+    }
+
+    enum pl_dither_method method = params->method;
+    ident_t lut = NULL_IDENT;
+    int lut_size = 0;
+
+    if (dither_method_is_lut(method)) {
+        if (!dither_state) {
+            PL_WARN(sh, "LUT-based dither method specified but no dither state "
+                    "object given, falling back to non-LUT based methods.");
+            goto fallback;
+        }
+
+        struct sh_dither_obj *obj;
+        obj = SH_OBJ(sh, dither_state, PL_SHADER_OBJ_DITHER,
+                     struct sh_dither_obj, sh_dither_uninit);
+        if (!obj)
+            goto fallback;
+
+        // Only the blue noise matrix goes through the shader cache
+        bool cache = method == PL_DITHER_BLUE_NOISE;
+        lut_size = 1 << PL_DEF(params->lut_size, pl_dither_default_params.lut_size);
+        lut = sh_lut(sh, sh_lut_params(
+            .object = &obj->lut,
+            .var_type = PL_VAR_FLOAT,
+            .width = lut_size,
+            .height = lut_size,
+            .comps = 1,
+            .fill = fill_dither_matrix,
+            .signature = (CACHE_KEY_DITHER ^ method) * lut_size,
+            .cache = cache ? SH_CACHE(sh) : NULL,
+            .priv = (void *) params,
+        ));
+        if (!lut)
+            goto fallback;
+    }
+
+    goto done;
+
+fallback:
+    method = PL_DITHER_ORDERED_FIXED;
+    // fall through
+
+done: ;
+
+    // Side length of the repeating dither pattern, 0 if there is none
+    int size = 0;
+    if (lut) {
+        size = lut_size;
+    } else if (method == PL_DITHER_ORDERED_FIXED) {
+        size = 16; // hard-coded size
+    }
+
+    if (size) {
+        // Transform the screen position to the cyclic range [0,1)
+        GLSL("vec2 pos = fract(gl_FragCoord.xy * 1.0/"$"); \n", SH_FLOAT(size));
+
+        if (params->temporal) {
+            // Cycle through 8 orientations of the pattern (4 rotations, each
+            // plain and mirrored), selected by the shader's index
+            int phase = SH_PARAMS(sh).index % 8;
+            float r = phase * (M_PI / 2); // rotate
+            float m = phase < 4 ? 1 : -1; // mirror
+            float mat[2][2] = {
+                {cos(r),     -sin(r)    },
+                {sin(r) * m,  cos(r) * m},
+            };
+
+            ident_t rot = sh_var(sh, (struct pl_shader_var) {
+                .var  = pl_var_mat2("dither_rot"),
+                .data = &mat[0][0],
+                .dynamic = true,
+            });
+            GLSL("pos = fract("$" * pos + vec2(1.0));\n", rot);
+        }
+    }
+
+    switch (method) {
+    case PL_DITHER_WHITE_NOISE: {
+        ident_t prng = sh_prng(sh, params->temporal, NULL);
+        GLSL("bias = "$".x;\n", prng);
+        break;
+    }
+
+    case PL_DITHER_ORDERED_FIXED:
+        // Bitwise ordered dither using only 32-bit uints
+        GLSL("uvec2 xy = uvec2(pos * 16.0) %% 16u; \n"
+             // Bitwise merge (morton number)
+             "xy.x = xy.x ^ xy.y; \n"
+             "xy = (xy | xy << 2) & uvec2(0x33333333); \n"
+             "xy = (xy | xy << 1) & uvec2(0x55555555); \n"
+             // Bitwise inversion
+             "uint b = xy.x + (xy.y << 1); \n"
+             "b = (b * 0x0802u & 0x22110u) | \n"
+             " (b * 0x8020u & 0x88440u); \n"
+             "b = 0x10101u * b; \n"
+             "b = (b >> 16) & 0xFFu; \n"
+             // Generate bias value
+             "bias = float(b) * 1.0/256.0; \n");
+        break;
+
+    case PL_DITHER_BLUE_NOISE:
+    case PL_DITHER_ORDERED_LUT:
+        pl_assert(lut);
+        GLSL("bias = "$"(ivec2(pos * "$"));\n", lut, SH_FLOAT(lut_size));
+        break;
+
+    case PL_DITHER_METHOD_COUNT:
+        pl_unreachable();
+    }
+
+    // Scale factor for dither rounding. The range check at the top admits
+    // depths up to 256, but shifting a 64-bit value by 64 or more is undefined
+    // behavior, so saturate at the largest representable value instead.
+    // Depths below 64 produce exactly (2^new_depth - 1).
+    const unsigned long long max_code =
+        new_depth < 64 ? (1LLU << new_depth) - 1 : ~0LLU;
+    GLSL("const float scale = %llu.0; \n", max_code);
+
+    const float gamma = approx_gamma(params->transfer);
+    if (gamma != 1.0f && new_depth <= 4) {
+        GLSL("const float gamma = "$"; \n"
+             "vec4 color_lin = pow(color, vec4(gamma)); \n",
+             SH_FLOAT(gamma));
+
+        if (new_depth == 1) {
+            // Special case for bit depth 1 dithering, in this case we can just
+            // ignore the low/high rounding because we know we are always
+            // dithering between 0.0 and 1.0.
+            GLSL("const vec4 low = vec4(0.0); \n"
+                 "const vec4 high = vec4(1.0); \n"
+                 "vec4 offset = color_lin; \n");
+        } else {
+            // Linearize the low, high and current color values
+            GLSL("vec4 low = floor(color * scale) / scale; \n"
+                 "vec4 high = ceil(color * scale) / scale; \n"
+                 "vec4 low_lin = pow(low, vec4(gamma)); \n"
+                 "vec4 high_lin = pow(high, vec4(gamma)); \n"
+                 "vec4 range = high_lin - low_lin; \n"
+                 "vec4 offset = (color_lin - low_lin) / \n"
+                 " max(range, 1e-6); \n");
+        }
+
+        // Mix in the correct ratio corresponding to the offset and bias
+        GLSL("color = mix(low, high, greaterThan(offset, vec4(bias))); \n");
+    } else {
+        // Approximate each gamma segment as a straight line, this simplifies
+        // the process of dithering down to a single scale and (biased) round.
+        GLSL("color = scale * color + vec4(bias); \n"
+             "color = floor(color) * (1.0 / scale); \n");
+    }
+
+    GLSL("} \n");
+}
+
+/* Error diffusion code is taken from mpv, original copyright (c) 2019 Bin Jin
+ *
+ * mpv is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with mpv. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+// Applies the (y, x) -> (y, x + y * shift) mapping to every non-zero entry of
+// the kernel's pattern and returns the largest resulting column offset, i.e.
+// the right-most column that the current column can propagate error into.
+static int compute_rightmost_shifted_column(const struct pl_error_diffusion_kernel *k)
+{
+    int rightmost = 0;
+
+    for (int y = 0; y <= PL_EDF_MAX_DY; y++) {
+        for (int x = PL_EDF_MIN_DX; x <= PL_EDF_MAX_DX; x++) {
+            if (k->pattern[y][x - PL_EDF_MIN_DX] == 0)
+                continue; // no error is propagated to this position
+
+            const int shifted_x = x + y * k->shift;
+
+            // The shift mapping guarantees current column (or left of it)
+            // won't be affected by error diffusion.
+            assert(shifted_x > 0);
+
+            if (shifted_x > rightmost)
+                rightmost = shifted_x;
+        }
+    }
+
+    return rightmost;
+}
+
+size_t pl_error_diffusion_shmem_req(const struct pl_error_diffusion_kernel *kernel,
+ int height)
+{
+ // We add PL_EDF_MAX_DY empty lines on the bottom to handle errors
+ // propagated out from bottom side.
+ int rows = height + PL_EDF_MAX_DY;
+ int shifted_columns = compute_rightmost_shifted_column(kernel) + 1;
+
+ // The shared memory is an array of size rows*shifted_columns. Each element
+ // is a single uint for three RGB component.
+ return rows * shifted_columns * sizeof(uint32_t);
+}
+
+// Emits a compute shader that dithers `params->input_tex` into
+// `params->output_tex` using parallel error diffusion.
+//
+// sh:     shader to fill; sh_require() is called with PL_SHADER_SIG_NONE and
+//         the input texture's size, and the output signature is set to
+//         PL_SHADER_SIG_NONE.
+// params: input/output textures (asserted to have equal dimensions), the
+//         target `new_depth` in [1, 256] and the diffusion `kernel`
+//         (`pl_error_diffusion_sierra_lite` is the PL_DEF() fallback).
+//
+// Returns false if sh_require() fails, if `new_depth` is out of range (a
+// warning is logged) or if a compute shader with enough shared memory cannot
+// be enabled (an error is logged). Returns true once the shader was emitted.
+bool pl_shader_error_diffusion(pl_shader sh, const struct pl_error_diffusion_params *params)
+{
+    const int width = params->input_tex->params.w, height = params->input_tex->params.h;
+    const struct pl_glsl_version glsl = sh_glsl(sh);
+    const struct pl_error_diffusion_kernel *kernel =
+        PL_DEF(params->kernel, &pl_error_diffusion_sierra_lite);
+
+    pl_assert(params->output_tex->params.w == width);
+    pl_assert(params->output_tex->params.h == height);
+    if (!sh_require(sh, PL_SHADER_SIG_NONE, width, height))
+        return false;
+
+    if (params->new_depth <= 0 || params->new_depth > 256) {
+        PL_WARN(sh, "Invalid dither depth: %d.. ignoring", params->new_depth);
+        return false;
+    }
+
+    // The parallel error diffusion works by applying the shift mapping first.
+    // Taking the Floyd and Steinberg algorithm for example. After applying
+    // the (y, x) -> (y, x + y * shift) mapping (with shift=2), all errors are
+    // propagated into the next few columns, which makes parallel processing on
+    // the same column possible.
+    //
+    //           X    7/16                X    7/16
+    //    3/16  5/16  1/16   ==>    0     0    3/16  5/16  1/16
+
+    // Figuring out the size of rectangle containing all shifted pixels.
+    // The rectangle height is not changed.
+    int shifted_width = width + (height - 1) * kernel->shift;
+
+    // We process all pixels from the shifted rectangles column by column, with
+    // a single global work group of size |block_size|.
+    // Figuring out how many block are required to process all pixels. We need
+    // this explicitly to make the number of barrier() calls match.
+    int block_size = PL_MIN(glsl.max_group_threads, height);
+    int blocks = PL_DIV_UP(height * shifted_width, block_size);
+
+    // If we figure out how many of the next columns will be affected while the
+    // current columns is being processed. We can store errors of only a few
+    // columns in the shared memory. Using a ring buffer will further save the
+    // cost while iterating to next column.
+    //
+    int ring_buffer_rows = height + PL_EDF_MAX_DY;
+    int ring_buffer_columns = compute_rightmost_shifted_column(kernel) + 1;
+    ident_t ring_buffer_size = sh_const(sh, (struct pl_shader_const) {
+        .type = PL_VAR_UINT,
+        .name = "ring_buffer_size",
+        .data = &(unsigned) { ring_buffer_rows * ring_buffer_columns },
+        .compile_time = true,
+    });
+
+    // Compute shared memory requirements and try enabling compute shader.
+    // Widen to size_t before multiplying, matching
+    // pl_error_diffusion_shmem_req().
+    size_t shmem_req = (size_t) ring_buffer_rows * ring_buffer_columns * sizeof(uint32_t);
+    if (!sh_try_compute(sh, block_size, 1, false, shmem_req)) {
+        PL_ERR(sh, "Cannot execute error diffusion kernel: too old GPU or "
+               "insufficient compute shader memory!");
+        return false;
+    }
+
+    ident_t in_tex = sh_desc(sh, (struct pl_shader_desc) {
+        .binding.object = params->input_tex,
+        .desc = {
+            .name = "input_tex",
+            .type = PL_DESC_SAMPLED_TEX,
+        },
+    });
+
+    ident_t out_img = sh_desc(sh, (struct pl_shader_desc) {
+        .binding.object = params->output_tex,
+        .desc = {
+            .name = "output_tex",
+            .type = PL_DESC_STORAGE_IMG,
+            .access = PL_DESC_ACCESS_WRITEONLY,
+        },
+    });
+
+    sh->output = PL_SHADER_SIG_NONE;
+    sh_describef(sh, "error diffusion (%s, %d bits)",
+                 kernel->name, params->new_depth);
+
+    // Defines the ring buffer in shared memory.
+    GLSLH("shared uint err_rgb8["$"]; \n", ring_buffer_size);
+    GLSL("// pl_shader_error_diffusion \n"
+         // Safeguard against accidental over-execution
+         "if (gl_WorkGroupID != uvec3(0)) \n"
+         " return; \n"
+         // Initialize the ring buffer.
+         "for (uint i = gl_LocalInvocationIndex; i < "$"; i+=gl_WorkGroupSize.x)\n"
+         " err_rgb8[i] = 0u; \n"
+
+         // Main block loop, add barrier here to have previous block all
+         // processed before starting the processing of the next.
+         "for (uint block_id = 0; block_id < "$"; block_id++) { \n"
+         "barrier(); \n"
+         // Compute the coordinate of the pixel we are currently processing,
+         // both before and after the shift mapping.
+         "uint id = block_id * gl_WorkGroupSize.x + gl_LocalInvocationIndex; \n"
+         "const uint height = "$"; \n"
+         "int y = int(id %% height), x_shifted = int(id / height); \n"
+         "int x = x_shifted - y * %d; \n"
+         // Proceed only if we are processing a valid pixel.
+         "if (x >= 0 && x < "$") { \n"
+         // The index that the current pixel have on the ring buffer.
+         "uint idx = uint(x_shifted * "$" + y) %% "$"; \n"
+         // Fetch the current pixel.
+         "vec4 pix_orig = texelFetch("$", ivec2(x, y), 0); \n"
+         "vec3 pix = pix_orig.rgb; \n",
+         ring_buffer_size,
+         SH_UINT(blocks),
+         SH_UINT(height),
+         kernel->shift,
+         SH_INT(width),
+         SH_INT(ring_buffer_rows),
+         ring_buffer_size,
+         in_tex);
+
+    // The dithering will quantize pixel value into multiples of 1/dither_quant.
+    // `new_depth` may be as large as 256 (see the range check above), but
+    // `1 << new_depth` on an int is only defined up to a shift of 30, so
+    // saturate at 2^31 - 1 for deeper targets. Depths below 31 produce exactly
+    // (2^new_depth - 1).
+    int dither_quant = params->new_depth < 31 ? (1 << params->new_depth) - 1
+                                              : 0x7FFFFFFF;
+
+    // We encode errors in RGB components into a single 32-bit unsigned integer.
+    // The error we propagate from the current pixel is in range of
+    // [-0.5 / dither_quant, 0.5 / dither_quant]. While not quite obvious, the
+    // sum of all errors been propagated into a pixel is also in the same range.
+    // It's possible to map errors in this range into [-127, 127], and use an
+    // unsigned 8-bit integer to store it (using standard two's complement).
+    // The three 8-bit unsigned integers can then be encoded into a single
+    // 32-bit unsigned integer, with two 4-bit padding to prevent addition
+    // operation overflows affecting other component. There are at most 12
+    // addition operations on each pixel, so 4-bit padding should be enough.
+    // The overflow from R component will be discarded.
+    //
+    // The following figure is how the encoding looks like.
+    //
+    //     +------------------------------------+
+    //     |RRRRRRRR|0000|GGGGGGGG|0000|BBBBBBBB|
+    //     +------------------------------------+
+    //
+
+    // The bitshift position for R and G component.
+    const int bitshift_r = 24, bitshift_g = 12;
+    // The multiplier we use to map [-0.5, 0.5] to [-127, 127].
+    const int uint8_mul = 127 * 2;
+
+    GLSL(// Add the error previously propagated into current pixel, and clear
+         // it in the ring buffer.
+         "uint err_u32 = err_rgb8[idx] + %uu; \n"
+         "pix = pix * %d.0 + vec3(int((err_u32 >> %d) & 0xFFu) - 128, \n"
+         " int((err_u32 >> %d) & 0xFFu) - 128, \n"
+         " int( err_u32 & 0xFFu) - 128) / %d.0; \n"
+         "err_rgb8[idx] = 0u; \n"
+         // Write the dithered pixel.
+         "vec3 dithered = round(pix); \n"
+         "imageStore("$", ivec2(x, y), vec4(dithered / %d.0, pix_orig.a)); \n"
+         // Prepare for error propagation pass
+         "vec3 err_divided = (pix - dithered) * %d.0 / %d.0; \n"
+         "ivec3 tmp; \n",
+         (128u << bitshift_r) | (128u << bitshift_g) | 128u,
+         dither_quant, bitshift_r, bitshift_g, uint8_mul,
+         out_img, dither_quant,
+         uint8_mul, kernel->divisor);
+
+    // Group error propagation with same weight factor together, in order to
+    // reduce the number of annoying error encoding.
+    for (int dividend = 1; dividend <= kernel->divisor; dividend++) {
+        bool err_assigned = false;
+
+        for (int y = 0; y <= PL_EDF_MAX_DY; y++) {
+            for (int x = PL_EDF_MIN_DX; x <= PL_EDF_MAX_DX; x++) {
+                if (kernel->pattern[y][x - PL_EDF_MIN_DX] != dividend)
+                    continue;
+
+                // Encode the weighted error once per distinct weight
+                if (!err_assigned) {
+                    err_assigned = true;
+
+                    GLSL("tmp = ivec3(round(err_divided * %d.0)); \n"
+                         "err_u32 = (uint(tmp.r & 0xFF) << %d) | \n"
+                         " (uint(tmp.g & 0xFF) << %d) | \n"
+                         " uint(tmp.b & 0xFF); \n",
+                         dividend,
+                         bitshift_r, bitshift_g);
+                }
+
+                int shifted_x = x + y * kernel->shift;
+
+                // Unlike the right border, errors propagated out from left
+                // border will remain in the ring buffer. This will produce
+                // visible artifacts near the left border, especially for
+                // shift=3 kernels.
+                if (x < 0)
+                    GLSL("if (x >= %d) \n", -x);
+
+                // Calculate the new position in the ring buffer to propagate
+                // the error into.
+                int ring_buffer_delta = shifted_x * ring_buffer_rows + y;
+                GLSL("atomicAdd(err_rgb8[(idx + %du) %% "$"], err_u32); \n",
+                     ring_buffer_delta, ring_buffer_size);
+            }
+        }
+    }
+
+    GLSL("}} \n"); // end of main loop + valid pixel conditional
+    return true;
+}
diff --git a/src/shaders/film_grain.c b/src/shaders/film_grain.c
new file mode 100644
index 0000000..b1d25ff
--- /dev/null
+++ b/src/shaders/film_grain.c
@@ -0,0 +1,65 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "shaders.h"
+#include "shaders/film_grain.h"
+
+// Reports whether any film grain has to be applied for `params`, deferring to
+// the AV1 / H.274 specific check that matches `params->data.type`.
+bool pl_needs_film_grain(const struct pl_film_grain_params *params)
+{
+    if (params->data.type == PL_FILM_GRAIN_NONE)
+        return false;
+
+    if (params->data.type == PL_FILM_GRAIN_AV1)
+        return pl_needs_fg_av1(params);
+
+    if (params->data.type == PL_FILM_GRAIN_H274)
+        return pl_needs_fg_h274(params);
+
+    pl_unreachable();
+}
+
+struct sh_grain_obj { // state kept behind a PL_SHADER_OBJ_FILM_GRAIN shader object
+ pl_shader_obj av1; // handed to pl_shader_fg_av1()
+ pl_shader_obj h274; // handed to pl_shader_fg_h274()
+};
+
+// Teardown callback registered through SH_OBJ() for the film grain state:
+// destroys the AV1 sub-object first, then the H.274 one.
+static void sh_grain_uninit(pl_gpu gpu, void *ptr)
+{
+    struct sh_grain_obj *grain = ptr;
+
+    pl_shader_obj_destroy(&grain->av1);
+    pl_shader_obj_destroy(&grain->h274);
+}
+
+// Applies film grain to `sh` by dispatching to the AV1 or H.274
+// implementation selected by `params->data.type`.
+//
+// `grain_state` holds the persistent state object shared between calls.
+// Returns false (after failing the shader) when no grain needs to be applied,
+// false when the state object cannot be obtained, and otherwise whatever the
+// selected implementation returns.
+bool pl_shader_film_grain(pl_shader sh, pl_shader_obj *grain_state,
+                          const struct pl_film_grain_params *params)
+{
+    const bool needed = pl_needs_film_grain(params);
+    if (!needed) {
+        // FIXME: Instead of erroring, sample directly
+        SH_FAIL(sh, "pl_shader_film_grain called but no film grain needs to be "
+                "applied, test with `pl_needs_film_grain` first!");
+        return false;
+    }
+
+    struct sh_grain_obj *state = SH_OBJ(sh, grain_state, PL_SHADER_OBJ_FILM_GRAIN,
+                                        struct sh_grain_obj, sh_grain_uninit);
+    if (!state)
+        return false;
+
+    if (params->data.type == PL_FILM_GRAIN_NONE)
+        return false;
+
+    if (params->data.type == PL_FILM_GRAIN_AV1)
+        return pl_shader_fg_av1(sh, &state->av1, params);
+
+    if (params->data.type == PL_FILM_GRAIN_H274)
+        return pl_shader_fg_h274(sh, &state->h274, params);
+
+    pl_unreachable();
+}
diff --git a/src/shaders/film_grain.h b/src/shaders/film_grain.h
new file mode 100644
index 0000000..f6498c1
--- /dev/null
+++ b/src/shaders/film_grain.h
@@ -0,0 +1,75 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "common.h"
+
+#include <libplacebo/shaders/film_grain.h>
+
+bool pl_needs_fg_av1(const struct pl_film_grain_params *); // called by pl_needs_film_grain() for PL_FILM_GRAIN_AV1
+bool pl_needs_fg_h274(const struct pl_film_grain_params *); // called by pl_needs_film_grain() for PL_FILM_GRAIN_H274
+
+bool pl_shader_fg_av1(pl_shader, pl_shader_obj *, const struct pl_film_grain_params *); // called by pl_shader_film_grain() for PL_FILM_GRAIN_AV1
+bool pl_shader_fg_h274(pl_shader, pl_shader_obj *, const struct pl_film_grain_params *); // called by pl_shader_film_grain() for PL_FILM_GRAIN_H274
+
+// Common helper function: resolves which channel is stored in component `i`
+// of the plane described by `params`.
+//
+// Returns PL_CHANNEL_NONE when `i` is not below `params->components` or when
+// `params->component_mapping[i]` lies outside [0, 2]. For the RGB (or unknown)
+// and XYZ color systems the mapped component is translated through a fixed
+// table; for the remaining color systems it is returned unchanged.
+static inline enum pl_channel channel_map(int i, const struct pl_film_grain_params *params)
+{
+    static const enum pl_channel rgb_lookup[3] = {
+        [PL_CHANNEL_R] = PL_CHANNEL_CR,
+        [PL_CHANNEL_G] = PL_CHANNEL_Y,
+        [PL_CHANNEL_B] = PL_CHANNEL_CB,
+    };
+
+    static const enum pl_channel xyz_lookup[3] = {
+        [0] = PL_CHANNEL_CR, // X
+        [1] = PL_CHANNEL_Y,  // Y
+        [2] = PL_CHANNEL_CB, // Z
+    };
+
+    if (i >= params->components)
+        return PL_CHANNEL_NONE;
+
+    const int comp = params->component_mapping[i];
+    const bool in_range = comp >= 0 && comp <= 2;
+    if (!in_range)
+        return PL_CHANNEL_NONE;
+
+    switch (params->repr->sys) {
+    case PL_COLOR_SYSTEM_XYZ:
+        return xyz_lookup[comp];
+
+    case PL_COLOR_SYSTEM_UNKNOWN:
+    case PL_COLOR_SYSTEM_RGB:
+        return rgb_lookup[comp];
+
+    // These systems use the mapped component index directly
+    case PL_COLOR_SYSTEM_BT_601:
+    case PL_COLOR_SYSTEM_BT_709:
+    case PL_COLOR_SYSTEM_SMPTE_240M:
+    case PL_COLOR_SYSTEM_BT_2020_NC:
+    case PL_COLOR_SYSTEM_BT_2020_C:
+    case PL_COLOR_SYSTEM_BT_2100_PQ:
+    case PL_COLOR_SYSTEM_BT_2100_HLG:
+    case PL_COLOR_SYSTEM_DOLBYVISION:
+    case PL_COLOR_SYSTEM_YCGCO:
+        return comp;
+
+    case PL_COLOR_SYSTEM_COUNT:
+        break;
+    }
+
+    pl_unreachable();
+}
diff --git a/src/shaders/film_grain_av1.c b/src/shaders/film_grain_av1.c
new file mode 100644
index 0000000..3b11ea3
--- /dev/null
+++ b/src/shaders/film_grain_av1.c
@@ -0,0 +1,1001 @@
+/*
+ * This file is part of libplacebo, which is normally licensed under the terms
+ * of the LGPL v2.1+. However, this file (film_grain_av1.c) is also available
+ * under the terms of the more permissive MIT license:
+ *
+ * Copyright (c) 2018-2019 Niklas Haas
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "shaders.h"
+#include "shaders/film_grain.h"
+
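+// Gaussian sequence table, taken from the AV1 spec. Its 2048 entries are
+// indexed by an 11-bit random number; values lie in the range [-2048, 2047],
+// with a mean of 0 and a standard deviation of 512.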
+static const int16_t gaussian_sequence[2048] = {
+ 56, 568, -180, 172, 124, -84, 172, -64, -900, 24, 820,
+ 224, 1248, 996, 272, -8, -916, -388, -732, -104, -188, 800,
+ 112, -652, -320, -376, 140, -252, 492, -168, 44, -788, 588,
+ -584, 500, -228, 12, 680, 272, -476, 972, -100, 652, 368,
+ 432, -196, -720, -192, 1000, -332, 652, -136, -552, -604, -4,
+ 192, -220, -136, 1000, -52, 372, -96, -624, 124, -24, 396,
+ 540, -12, -104, 640, 464, 244, -208, -84, 368, -528, -740,
+ 248, -968, -848, 608, 376, -60, -292, -40, -156, 252, -292,
+ 248, 224, -280, 400, -244, 244, -60, 76, -80, 212, 532,
+ 340, 128, -36, 824, -352, -60, -264, -96, -612, 416, -704,
+ 220, -204, 640, -160, 1220, -408, 900, 336, 20, -336, -96,
+ -792, 304, 48, -28, -1232, -1172, -448, 104, -292, -520, 244,
+ 60, -948, 0, -708, 268, 108, 356, -548, 488, -344, -136,
+ 488, -196, -224, 656, -236, -1128, 60, 4, 140, 276, -676,
+ -376, 168, -108, 464, 8, 564, 64, 240, 308, -300, -400,
+ -456, -136, 56, 120, -408, -116, 436, 504, -232, 328, 844,
+ -164, -84, 784, -168, 232, -224, 348, -376, 128, 568, 96,
+ -1244, -288, 276, 848, 832, -360, 656, 464, -384, -332, -356,
+ 728, -388, 160, -192, 468, 296, 224, 140, -776, -100, 280,
+ 4, 196, 44, -36, -648, 932, 16, 1428, 28, 528, 808,
+ 772, 20, 268, 88, -332, -284, 124, -384, -448, 208, -228,
+ -1044, -328, 660, 380, -148, -300, 588, 240, 540, 28, 136,
+ -88, -436, 256, 296, -1000, 1400, 0, -48, 1056, -136, 264,
+ -528, -1108, 632, -484, -592, -344, 796, 124, -668, -768, 388,
+ 1296, -232, -188, -200, -288, -4, 308, 100, -168, 256, -500,
+ 204, -508, 648, -136, 372, -272, -120, -1004, -552, -548, -384,
+ 548, -296, 428, -108, -8, -912, -324, -224, -88, -112, -220,
+ -100, 996, -796, 548, 360, -216, 180, 428, -200, -212, 148,
+ 96, 148, 284, 216, -412, -320, 120, -300, -384, -604, -572,
+ -332, -8, -180, -176, 696, 116, -88, 628, 76, 44, -516,
+ 240, -208, -40, 100, -592, 344, -308, -452, -228, 20, 916,
+ -1752, -136, -340, -804, 140, 40, 512, 340, 248, 184, -492,
+ 896, -156, 932, -628, 328, -688, -448, -616, -752, -100, 560,
+ -1020, 180, -800, -64, 76, 576, 1068, 396, 660, 552, -108,
+ -28, 320, -628, 312, -92, -92, -472, 268, 16, 560, 516,
+ -672, -52, 492, -100, 260, 384, 284, 292, 304, -148, 88,
+ -152, 1012, 1064, -228, 164, -376, -684, 592, -392, 156, 196,
+ -524, -64, -884, 160, -176, 636, 648, 404, -396, -436, 864,
+ 424, -728, 988, -604, 904, -592, 296, -224, 536, -176, -920,
+ 436, -48, 1176, -884, 416, -776, -824, -884, 524, -548, -564,
+ -68, -164, -96, 692, 364, -692, -1012, -68, 260, -480, 876,
+ -1116, 452, -332, -352, 892, -1088, 1220, -676, 12, -292, 244,
+ 496, 372, -32, 280, 200, 112, -440, -96, 24, -644, -184,
+ 56, -432, 224, -980, 272, -260, 144, -436, 420, 356, 364,
+ -528, 76, 172, -744, -368, 404, -752, -416, 684, -688, 72,
+ 540, 416, 92, 444, 480, -72, -1416, 164, -1172, -68, 24,
+ 424, 264, 1040, 128, -912, -524, -356, 64, 876, -12, 4,
+ -88, 532, 272, -524, 320, 276, -508, 940, 24, -400, -120,
+ 756, 60, 236, -412, 100, 376, -484, 400, -100, -740, -108,
+ -260, 328, -268, 224, -200, -416, 184, -604, -564, -20, 296,
+ 60, 892, -888, 60, 164, 68, -760, 216, -296, 904, -336,
+ -28, 404, -356, -568, -208, -1480, -512, 296, 328, -360, -164,
+ -1560, -776, 1156, -428, 164, -504, -112, 120, -216, -148, -264,
+ 308, 32, 64, -72, 72, 116, 176, -64, -272, 460, -536,
+ -784, -280, 348, 108, -752, -132, 524, -540, -776, 116, -296,
+ -1196, -288, -560, 1040, -472, 116, -848, -1116, 116, 636, 696,
+ 284, -176, 1016, 204, -864, -648, -248, 356, 972, -584, -204,
+ 264, 880, 528, -24, -184, 116, 448, -144, 828, 524, 212,
+ -212, 52, 12, 200, 268, -488, -404, -880, 824, -672, -40,
+ 908, -248, 500, 716, -576, 492, -576, 16, 720, -108, 384,
+ 124, 344, 280, 576, -500, 252, 104, -308, 196, -188, -8,
+ 1268, 296, 1032, -1196, 436, 316, 372, -432, -200, -660, 704,
+ -224, 596, -132, 268, 32, -452, 884, 104, -1008, 424, -1348,
+ -280, 4, -1168, 368, 476, 696, 300, -8, 24, 180, -592,
+ -196, 388, 304, 500, 724, -160, 244, -84, 272, -256, -420,
+ 320, 208, -144, -156, 156, 364, 452, 28, 540, 316, 220,
+ -644, -248, 464, 72, 360, 32, -388, 496, -680, -48, 208,
+ -116, -408, 60, -604, -392, 548, -840, 784, -460, 656, -544,
+ -388, -264, 908, -800, -628, -612, -568, 572, -220, 164, 288,
+ -16, -308, 308, -112, -636, -760, 280, -668, 432, 364, 240,
+ -196, 604, 340, 384, 196, 592, -44, -500, 432, -580, -132,
+ 636, -76, 392, 4, -412, 540, 508, 328, -356, -36, 16,
+ -220, -64, -248, -60, 24, -192, 368, 1040, 92, -24, -1044,
+ -32, 40, 104, 148, 192, -136, -520, 56, -816, -224, 732,
+ 392, 356, 212, -80, -424, -1008, -324, 588, -1496, 576, 460,
+ -816, -848, 56, -580, -92, -1372, -112, -496, 200, 364, 52,
+ -140, 48, -48, -60, 84, 72, 40, 132, -356, -268, -104,
+ -284, -404, 732, -520, 164, -304, -540, 120, 328, -76, -460,
+ 756, 388, 588, 236, -436, -72, -176, -404, -316, -148, 716,
+ -604, 404, -72, -88, -888, -68, 944, 88, -220, -344, 960,
+ 472, 460, -232, 704, 120, 832, -228, 692, -508, 132, -476,
+ 844, -748, -364, -44, 1116, -1104, -1056, 76, 428, 552, -692,
+ 60, 356, 96, -384, -188, -612, -576, 736, 508, 892, 352,
+ -1132, 504, -24, -352, 324, 332, -600, -312, 292, 508, -144,
+ -8, 484, 48, 284, -260, -240, 256, -100, -292, -204, -44,
+ 472, -204, 908, -188, -1000, -256, 92, 1164, -392, 564, 356,
+ 652, -28, -884, 256, 484, -192, 760, -176, 376, -524, -452,
+ -436, 860, -736, 212, 124, 504, -476, 468, 76, -472, 552,
+ -692, -944, -620, 740, -240, 400, 132, 20, 192, -196, 264,
+ -668, -1012, -60, 296, -316, -828, 76, -156, 284, -768, -448,
+ -832, 148, 248, 652, 616, 1236, 288, -328, -400, -124, 588,
+ 220, 520, -696, 1032, 768, -740, -92, -272, 296, 448, -464,
+ 412, -200, 392, 440, -200, 264, -152, -260, 320, 1032, 216,
+ 320, -8, -64, 156, -1016, 1084, 1172, 536, 484, -432, 132,
+ 372, -52, -256, 84, 116, -352, 48, 116, 304, -384, 412,
+ 924, -300, 528, 628, 180, 648, 44, -980, -220, 1320, 48,
+ 332, 748, 524, -268, -720, 540, -276, 564, -344, -208, -196,
+ 436, 896, 88, -392, 132, 80, -964, -288, 568, 56, -48,
+ -456, 888, 8, 552, -156, -292, 948, 288, 128, -716, -292,
+ 1192, -152, 876, 352, -600, -260, -812, -468, -28, -120, -32,
+ -44, 1284, 496, 192, 464, 312, -76, -516, -380, -456, -1012,
+ -48, 308, -156, 36, 492, -156, -808, 188, 1652, 68, -120,
+ -116, 316, 160, -140, 352, 808, -416, 592, 316, -480, 56,
+ 528, -204, -568, 372, -232, 752, -344, 744, -4, 324, -416,
+ -600, 768, 268, -248, -88, -132, -420, -432, 80, -288, 404,
+ -316, -1216, -588, 520, -108, 92, -320, 368, -480, -216, -92,
+ 1688, -300, 180, 1020, -176, 820, -68, -228, -260, 436, -904,
+ 20, 40, -508, 440, -736, 312, 332, 204, 760, -372, 728,
+ 96, -20, -632, -520, -560, 336, 1076, -64, -532, 776, 584,
+ 192, 396, -728, -520, 276, -188, 80, -52, -612, -252, -48,
+ 648, 212, -688, 228, -52, -260, 428, -412, -272, -404, 180,
+ 816, -796, 48, 152, 484, -88, -216, 988, 696, 188, -528,
+ 648, -116, -180, 316, 476, 12, -564, 96, 476, -252, -364,
+ -376, -392, 556, -256, -576, 260, -352, 120, -16, -136, -260,
+ -492, 72, 556, 660, 580, 616, 772, 436, 424, -32, -324,
+ -1268, 416, -324, -80, 920, 160, 228, 724, 32, -516, 64,
+ 384, 68, -128, 136, 240, 248, -204, -68, 252, -932, -120,
+ -480, -628, -84, 192, 852, -404, -288, -132, 204, 100, 168,
+ -68, -196, -868, 460, 1080, 380, -80, 244, 0, 484, -888,
+ 64, 184, 352, 600, 460, 164, 604, -196, 320, -64, 588,
+ -184, 228, 12, 372, 48, -848, -344, 224, 208, -200, 484,
+ 128, -20, 272, -468, -840, 384, 256, -720, -520, -464, -580,
+ 112, -120, 644, -356, -208, -608, -528, 704, 560, -424, 392,
+ 828, 40, 84, 200, -152, 0, -144, 584, 280, -120, 80,
+ -556, -972, -196, -472, 724, 80, 168, -32, 88, 160, -688,
+ 0, 160, 356, 372, -776, 740, -128, 676, -248, -480, 4,
+ -364, 96, 544, 232, -1032, 956, 236, 356, 20, -40, 300,
+ 24, -676, -596, 132, 1120, -104, 532, -1096, 568, 648, 444,
+ 508, 380, 188, -376, -604, 1488, 424, 24, 756, -220, -192,
+ 716, 120, 920, 688, 168, 44, -460, 568, 284, 1144, 1160,
+ 600, 424, 888, 656, -356, -320, 220, 316, -176, -724, -188,
+ -816, -628, -348, -228, -380, 1012, -452, -660, 736, 928, 404,
+ -696, -72, -268, -892, 128, 184, -344, -780, 360, 336, 400,
+ 344, 428, 548, -112, 136, -228, -216, -820, -516, 340, 92,
+ -136, 116, -300, 376, -244, 100, -316, -520, -284, -12, 824,
+ 164, -548, -180, -128, 116, -924, -828, 268, -368, -580, 620,
+ 192, 160, 0, -1676, 1068, 424, -56, -360, 468, -156, 720,
+ 288, -528, 556, -364, 548, -148, 504, 316, 152, -648, -620,
+ -684, -24, -376, -384, -108, -920, -1032, 768, 180, -264, -508,
+ -1268, -260, -60, 300, -240, 988, 724, -376, -576, -212, -736,
+ 556, 192, 1092, -620, -880, 376, -56, -4, -216, -32, 836,
+ 268, 396, 1332, 864, -600, 100, 56, -412, -92, 356, 180,
+ 884, -468, -436, 292, -388, -804, -704, -840, 368, -348, 140,
+ -724, 1536, 940, 372, 112, -372, 436, -480, 1136, 296, -32,
+ -228, 132, -48, -220, 868, -1016, -60, -1044, -464, 328, 916,
+ 244, 12, -736, -296, 360, 468, -376, -108, -92, 788, 368,
+ -56, 544, 400, -672, -420, 728, 16, 320, 44, -284, -380,
+ -796, 488, 132, 204, -596, -372, 88, -152, -908, -636, -572,
+ -624, -116, -692, -200, -56, 276, -88, 484, -324, 948, 864,
+ 1000, -456, -184, -276, 292, -296, 156, 676, 320, 160, 908,
+ -84, -1236, -288, -116, 260, -372, -644, 732, -756, -96, 84,
+ 344, -520, 348, -688, 240, -84, 216, -1044, -136, -676, -396,
+ -1500, 960, -40, 176, 168, 1516, 420, -504, -344, -364, -360,
+ 1216, -940, -380, -212, 252, -660, -708, 484, -444, -152, 928,
+ -120, 1112, 476, -260, 560, -148, -344, 108, -196, 228, -288,
+ 504, 560, -328, -88, 288, -1008, 460, -228, 468, -836, -196,
+ 76, 388, 232, 412, -1168, -716, -644, 756, -172, -356, -504,
+ 116, 432, 528, 48, 476, -168, -608, 448, 160, -532, -272,
+ 28, -676, -12, 828, 980, 456, 520, 104, -104, 256, -344,
+ -4, -28, -368, -52, -524, -572, -556, -200, 768, 1124, -208,
+ -512, 176, 232, 248, -148, -888, 604, -600, -304, 804, -156,
+ -212, 488, -192, -804, -256, 368, -360, -916, -328, 228, -240,
+ -448, -472, 856, -556, -364, 572, -12, -156, -368, -340, 432,
+ 252, -752, -152, 288, 268, -580, -848, -592, 108, -76, 244,
+ 312, -716, 592, -80, 436, 360, 4, -248, 160, 516, 584,
+ 732, 44, -468, -280, -292, -156, -588, 28, 308, 912, 24,
+ 124, 156, 180, -252, 944, -924, -772, -520, -428, -624, 300,
+ -212, -1144, 32, -724, 800, -1128, -212, -1288, -848, 180, -416,
+ 440, 192, -576, -792, -76, -1080, 80, -532, -352, -132, 380,
+ -820, 148, 1112, 128, 164, 456, 700, -924, 144, -668, -384,
+ 648, -832, 508, 552, -52, -100, -656, 208, -568, 748, -88,
+ 680, 232, 300, 192, -408, -1012, -152, -252, -268, 272, -876,
+ -664, -648, -332, -136, 16, 12, 1152, -28, 332, -536, 320,
+ -672, -460, -316, 532, -260, 228, -40, 1052, -816, 180, 88,
+ -496, -556, -672, -368, 428, 92, 356, 404, -408, 252, 196,
+ -176, -556, 792, 268, 32, 372, 40, 96, -332, 328, 120,
+ 372, -900, -40, 472, -264, -592, 952, 128, 656, 112, 664,
+ -232, 420, 4, -344, -464, 556, 244, -416, -32, 252, 0,
+ -412, 188, -696, 508, -476, 324, -1096, 656, -312, 560, 264,
+ -136, 304, 160, -64, -580, 248, 336, -720, 560, -348, -288,
+ -276, -196, -500, 852, -544, -236, -1128, -992, -776, 116, 56,
+ 52, 860, 884, 212, -12, 168, 1020, 512, -552, 924, -148,
+ 716, 188, 164, -340, -520, -184, 880, -152, -680, -208, -1156,
+ -300, -528, -472, 364, 100, -744, -1056, -32, 540, 280, 144,
+ -676, -32, -232, -280, -224, 96, 568, -76, 172, 148, 148,
+ 104, 32, -296, -32, 788, -80, 32, -16, 280, 288, 944,
+ 428, -484
+};
+
+// Advances the 16-bit linear feedback shift register in `*state` by one step
+// (feedback is the XOR of bits 0, 1, 3 and 12, shifted in at the top) and
+// returns the `bits` most significant bits of the new state.
+static inline int get_random_number(int bits, uint16_t *state)
+{
+    const unsigned cur = *state;
+    const unsigned feedback = (cur ^ (cur >> 1) ^ (cur >> 3) ^ (cur >> 12)) & 1u;
+    const uint16_t next = (uint16_t) ((cur >> 1) | (feedback << 15));
+
+    *state = next;
+    return (next >> (16 - bits)) & ((1 << bits) - 1);
+}
+
+// Divides `x` by 2^shift, rounding to nearest by adding half of the divisor
+// before shifting. A shift of zero returns `x` unchanged.
+static inline int round2(int x, int shift)
+{
+    const int half = shift ? 1 << (shift - 1) : 0;
+    return (x + half) >> shift;
+}
+
+// Dimensions shared by the grain generators and the shader generation code
+enum {
+ // Edge length of one grain block (halved along subsampled axes by users)
+ BLOCK_SIZE = 32,
+ // Number of entries in each scaling LUT
+ SCALING_LUT_SIZE = 256,
+
+ // Size of the full intermediate grain buffers
+ GRAIN_WIDTH = 82,
+ GRAIN_HEIGHT = 73,
+ // On the GPU we only need a subsection of this
+ GRAIN_WIDTH_LUT = 64,
+ GRAIN_HEIGHT_LUT = 64,
+ // Offset into the full buffer at which the LUT subsection starts
+ GRAIN_PAD_LUT = 9,
+
+ // For subsampled grain textures
+ SUB_GRAIN_WIDTH = 44,
+ SUB_GRAIN_HEIGHT = 38,
+ SUB_GRAIN_WIDTH_LUT = GRAIN_WIDTH_LUT >> 1,
+ SUB_GRAIN_HEIGHT_LUT = GRAIN_HEIGHT_LUT >> 1,
+ SUB_GRAIN_PAD_LUT = 6,
+};
+
+// Contains the shift by which the offsets are indexed, i.e. the bit position
+// of each neighbour's 8-bit random value inside the packed 32-bit word written
+// by generate_offsets(). sample() additionally tests the OFFSET_L / OFFSET_T
+// bits of these values to tell which neighbouring block is being addressed
+// (OFFSET_TL has both bits set, OFFSET_N has neither).
+enum offset {
+ OFFSET_TL = 24,
+ OFFSET_T = 16,
+ OFFSET_L = 8,
+ OFFSET_N = 0,
+};
+
+// Common constants derived from the bit depth and color representation, as
+// computed by get_grain_scale()
+struct grain_scale {
+ // Mid-point of the grain value range (128 at 8 bits, scaled up with depth)
+ int grain_center;
+ // Inclusive clamping range for integer grain values
+ int grain_min;
+ int grain_max;
+ // Factor returned by pl_color_repr_normalize() for the input representation
+ float texture_scale;
+ // Converts integer grain values to the normalized [0, 1] sample scale
+ float grain_scale;
+};
+
+// Returns the bit depth to synthesize grain at: the representation's color
+// depth, defaulting (via PL_DEF) to its sample depth and then to 8. The result
+// is asserted to be at least 8 and is capped at 12.
+static inline int bit_depth(const struct pl_color_repr *repr)
+{
+    const int fallback = PL_DEF(repr->bits.sample_depth, 8);
+    const int depth = PL_DEF(repr->bits.color_depth, fallback);
+
+    pl_assert(depth >= 8);
+    return PL_MIN(depth, 12);
+}
+
+// Computes the grain range and scaling constants for the given parameters.
+static struct grain_scale get_grain_scale(const struct pl_film_grain_params *params)
+{
+    const int bits = bit_depth(params->repr);
+    const int center = 128 << (bits - 8);
+
+    // pl_color_repr_normalize() takes a mutable pointer, so hand it a local
+    // copy rather than the caller's representation
+    struct pl_color_repr tmp = *params->repr;
+
+    return (struct grain_scale) {
+        .grain_center  = center,
+        .grain_min     = -center,
+        .grain_max     = (256 << (bits - 8)) - 1 - center,
+        .texture_scale = pl_color_repr_normalize(&tmp),
+        // Since our color samples are normalized to the range [0, 1], grain
+        // values on the scale [0, 2^b - 1] need to be scaled down to match
+        .grain_scale   = 1.0 / ((1 << bits) - 1),
+    };
+}
+
+// Generates the basic grain table (LumaGrain in the spec).
+//
+// `buf` receives the full GRAIN_HEIGHT x GRAIN_WIDTH integer grain template:
+// pseudo-random gaussian samples, filtered in place by the auto-regressive
+// luma coefficients. `out` receives the GRAIN_HEIGHT_LUT x GRAIN_WIDTH_LUT
+// window of that template starting at GRAIN_PAD_LUT, converted to floats on
+// the normalized sample scale.
+static void generate_grain_y(float out[GRAIN_HEIGHT_LUT][GRAIN_WIDTH_LUT],
+                             int16_t buf[GRAIN_HEIGHT][GRAIN_WIDTH],
+                             const struct pl_film_grain_params *params)
+{
+    const struct pl_av1_grain_data *data = &params->data.params.av1;
+    struct grain_scale scale = get_grain_scale(params);
+    uint16_t seed = (uint16_t) params->data.seed;
+    int bits = bit_depth(params->repr);
+    int shift = 12 - bits + data->grain_scale_shift;
+    pl_assert(shift >= 0);
+
+    // Fill the whole template with scaled-down gaussian noise
+    for (int y = 0; y < GRAIN_HEIGHT; y++) {
+        for (int x = 0; x < GRAIN_WIDTH; x++) {
+            int16_t value = gaussian_sequence[ get_random_number(11, &seed) ];
+            buf[y][x] = round2(value, shift);
+        }
+    }
+
+    const int ar_pad = 3;
+    int ar_lag = data->ar_coeff_lag;
+
+    // Apply the auto-regressive filter in raster order, leaving an `ar_pad`
+    // border so that every referenced neighbour lies inside the buffer
+    for (int y = ar_pad; y < GRAIN_HEIGHT; y++) {
+        for (int x = ar_pad; x < GRAIN_WIDTH - ar_pad; x++) {
+            const int8_t *coeff = data->ar_coeffs_y;
+            int sum = 0;
+            for (int dy = -ar_lag; dy <= 0; dy++) {
+                for (int dx = -ar_lag; dx <= ar_lag; dx++) {
+                    // Stop at the current pixel; only causal neighbours count
+                    if (!dx && !dy)
+                        break;
+                    sum += *(coeff++) * buf[y + dy][x + dx];
+                }
+            }
+
+            // Keep the unclamped value in an `int`: for large coefficients it
+            // can exceed the int16_t range, and narrowing before the clamp
+            // would wrap around instead of saturating
+            int grain = buf[y][x] + round2(sum, data->ar_coeff_shift);
+            buf[y][x] = PL_CLAMP(grain, scale.grain_min, scale.grain_max);
+        }
+    }
+
+    // Export the sub-window needed on the GPU, on the normalized scale
+    for (int y = 0; y < GRAIN_HEIGHT_LUT; y++) {
+        for (int x = 0; x < GRAIN_WIDTH_LUT; x++) {
+            int16_t grain = buf[y + GRAIN_PAD_LUT][x + GRAIN_PAD_LUT];
+            out[y][x] = grain * scale.grain_scale;
+        }
+    }
+}
+
+// Generates the grain table for one chroma channel (PL_CHANNEL_CB or
+// PL_CHANNEL_CR).
+//
+// `buf` is scratch space for the integer chroma grain template, `buf_y` is the
+// luma template previously produced by generate_grain_y(). `sub_x` / `sub_y`
+// are 1 when the plane is subsampled along that axis, 0 otherwise. `out`
+// receives (GRAIN_HEIGHT_LUT >> sub_y) rows of (GRAIN_WIDTH_LUT >> sub_x)
+// floats, tightly packed in row-major order.
+static void generate_grain_uv(float *out, int16_t buf[GRAIN_HEIGHT][GRAIN_WIDTH],
+                              const int16_t buf_y[GRAIN_HEIGHT][GRAIN_WIDTH],
+                              enum pl_channel channel, int sub_x, int sub_y,
+                              const struct pl_film_grain_params *params)
+{
+    const struct pl_av1_grain_data *data = &params->data.params.av1;
+    struct grain_scale scale = get_grain_scale(params);
+    int bits = bit_depth(params->repr);
+    int shift = 12 - bits + data->grain_scale_shift;
+    pl_assert(shift >= 0);
+
+    // Each chroma channel perturbs the seed with its own constant
+    uint16_t seed = params->data.seed;
+    if (channel == PL_CHANNEL_CB) {
+        seed ^= 0xb524;
+    } else if (channel == PL_CHANNEL_CR) {
+        seed ^= 0x49d8;
+    }
+
+    int chromaW = sub_x ? SUB_GRAIN_WIDTH : GRAIN_WIDTH;
+    int chromaH = sub_y ? SUB_GRAIN_HEIGHT : GRAIN_HEIGHT;
+
+    const int8_t *coeffs[] = {
+        [PL_CHANNEL_CB] = data->ar_coeffs_uv[0],
+        [PL_CHANNEL_CR] = data->ar_coeffs_uv[1],
+    };
+
+    // Fill the template with scaled-down gaussian noise
+    for (int y = 0; y < chromaH; y++) {
+        for (int x = 0; x < chromaW; x++) {
+            int16_t value = gaussian_sequence[ get_random_number(11, &seed) ];
+            buf[y][x] = round2(value, shift);
+        }
+    }
+
+    const int ar_pad = 3;
+    int ar_lag = data->ar_coeff_lag;
+
+    for (int y = ar_pad; y < chromaH; y++) {
+        for (int x = ar_pad; x < chromaW - ar_pad; x++) {
+            const int8_t *coeff = coeffs[channel];
+            pl_assert(coeff);
+            int sum = 0;
+            for (int dy = -ar_lag; dy <= 0; dy++) {
+                for (int dx = -ar_lag; dx <= ar_lag; dx++) {
+                    // For the final (current) pixel, we need to add in the
+                    // contribution from the luma grain texture
+                    if (!dx && !dy) {
+                        if (!data->num_points_y)
+                            break;
+                        // Average the luma grain samples covered by this
+                        // chroma sample (1, 2 or 4 depending on subsampling)
+                        int luma = 0;
+                        int lumaX = ((x - ar_pad) << sub_x) + ar_pad;
+                        int lumaY = ((y - ar_pad) << sub_y) + ar_pad;
+                        for (int i = 0; i <= sub_y; i++) {
+                            for (int j = 0; j <= sub_x; j++) {
+                                luma += buf_y[lumaY + i][lumaX + j];
+                            }
+                        }
+                        luma = round2(luma, sub_x + sub_y);
+                        sum += luma * (*coeff);
+                        break;
+                    }
+
+                    sum += *(coeff++) * buf[y + dy][x + dx];
+                }
+            }
+
+            // Keep the unclamped value in an `int`: for large coefficients it
+            // can exceed the int16_t range, and narrowing before the clamp
+            // would wrap around instead of saturating
+            int grain = buf[y][x] + round2(sum, data->ar_coeff_shift);
+            buf[y][x] = PL_CLAMP(grain, scale.grain_min, scale.grain_max);
+        }
+    }
+
+    int lutW = GRAIN_WIDTH_LUT >> sub_x;
+    int lutH = GRAIN_HEIGHT_LUT >> sub_y;
+    int padX = sub_x ? SUB_GRAIN_PAD_LUT : GRAIN_PAD_LUT;
+    int padY = sub_y ? SUB_GRAIN_PAD_LUT : GRAIN_PAD_LUT;
+
+    // Export the sub-window needed on the GPU, on the normalized scale
+    for (int y = 0; y < lutH; y++) {
+        for (int x = 0; x < lutW; x++) {
+            int16_t grain = buf[y + padY][x + padX];
+            out[y * lutW + x] = grain * scale.grain_scale;
+        }
+    }
+}
+
+// LUT fill callback producing one packed 32-bit word per grain block.
+// `params->priv` points at the pl_film_grain_data providing the seed; `pbuf`
+// holds params->width * params->height entries in row-major order.
+static void generate_offsets(void *pbuf, const struct sh_lut_params *params)
+{
+    const struct pl_film_grain_data *grain = params->priv;
+    unsigned int *lut = pbuf;
+    pl_static_assert(sizeof(unsigned int) >= sizeof(uint32_t));
+
+    for (int row = 0; row < params->height; row++) {
+        // Every row restarts from the seed, perturbed by the row index
+        uint16_t rng = grain->seed;
+        rng ^= ((row * 37 + 178) & 0xFF) << 8;
+        rng ^= ((row * 173 + 105) & 0xFF);
+
+        for (int col = 0; col < params->width; col++) {
+            unsigned int *cur = &lut[row * params->width + col];
+            const uint8_t self = get_random_number(8, &rng);
+
+            // Neighbouring entries were written earlier in raster order; the
+            // narrowing to uint8_t keeps only their own (OFFSET_N) byte
+            uint8_t left = 0, top = 0, top_left = 0;
+            if (col)
+                left = *(cur - 1);
+            if (row)
+                top = *(cur - params->width);
+            if (col && row)
+                top_left = *(cur - params->width - 1);
+
+            // Encode four offsets into a single 32-bit integer for the
+            // convenience of the GPU. That way only one LUT fetch is
+            // required for the entire block.
+            *cur = ((uint32_t) top_left << OFFSET_TL)
+                 | ((uint32_t) top      << OFFSET_T)
+                 | ((uint32_t) left     << OFFSET_L)
+                 | ((uint32_t) self     << OFFSET_N);
+        }
+    }
+}
+
+// LUT fill callback expanding a piecewise-linear scaling function, given as a
+// list of (x, y) control points, into SCALING_LUT_SIZE floats divided by
+// 2^scaling_shift. `params->priv` must point at a struct with the layout
+// declared below, containing at least one point.
+static void generate_scaling(void *pdata, const struct sh_lut_params *params)
+{
+    assert(params->width == SCALING_LUT_SIZE && params->comps == 1);
+    float *data = pdata;
+
+    struct {
+        int num;
+        const uint8_t (*points)[2];
+        const struct pl_av1_grain_data *data;
+    } *ctx = params->priv;
+
+    float range = 1 << ctx->data->scaling_shift;
+
+    // Fill up the preceding entries with the initial value
+    for (int i = 0; i < ctx->points[0][0]; i++)
+        data[i] = ctx->points[0][1] / range;
+
+    // Linearly interpolate the values in the middle
+    for (int i = 0; i < ctx->num - 1; i++) {
+        int bx = ctx->points[i][0];
+        int by = ctx->points[i][1];
+        int dx = ctx->points[i + 1][0] - bx;
+        int dy = ctx->points[i + 1][1] - by;
+
+        // Points are expected to have strictly increasing x coordinates. An
+        // empty or reversed segment covers no LUT entries, and a zero-width
+        // one would otherwise divide by zero below.
+        if (dx <= 0)
+            continue;
+
+        // 16.16 fixed-point slope, using a rounded reciprocal of dx
+        int delta = dy * ((0x10000 + (dx >> 1)) / dx);
+        for (int x = 0; x < dx; x++) {
+            int v = by + ((x * delta + 0x8000) >> 16);
+            data[bx + x] = v / range;
+        }
+    }
+
+    // Fill up the remaining entries with the final value
+    for (int i = ctx->points[ctx->num - 1][0]; i < SCALING_LUT_SIZE; i++)
+        data[i] = ctx->points[ctx->num - 1][1] / range;
+}
+
+// Emits GLSL that fetches one grain value from `lut` into `val`, for the block
+// selected by `off`. The two 4-bit random offsets are read from the byte of
+// `data` at bit position `off`. `idx` selects the LUT component to read (.x or
+// .y), or the whole value when negative.
+static void sample(pl_shader sh, enum offset off, ident_t lut, int idx,
+                   int sub_x, int sub_y)
+{
+    static const char *index_strs[] = {
+        [0] = ".x",
+        [1] = ".y",
+    };
+
+    // The OFFSET_L / OFFSET_T bits of `off` tell whether the addressed block
+    // is the left and/or top neighbour, which shifts the position by one block
+    const int block_dx = (off & OFFSET_L) ? 1 : 0;
+    const int block_dy = (off & OFFSET_T) ? 1 : 0;
+    const int shift_x = (BLOCK_SIZE >> sub_x) * block_dx;
+    const int shift_y = (BLOCK_SIZE >> sub_y) * block_dy;
+
+    // Random offsets advance in steps of 2 texels, or 1 when subsampled
+    const int step_x = sub_x ? 1 : 2;
+    const int step_y = sub_y ? 1 : 2;
+
+    const char *component = idx >= 0 ? index_strs[idx] : "";
+
+    GLSL("offset = uvec2(%du, %du) * uvec2((data >> %d) & 0xFu, \n"
+         " (data >> %d) & 0xFu);\n"
+         "pos = offset + local_id.xy + uvec2(%d, %d); \n"
+         "val = "$"(pos)%s; \n",
+         step_x, step_y, off + 4, off,
+         shift_x,
+         shift_y,
+         lut, component);
+}
+
+// Persistent state attached to the shader object used for AV1 film grain;
+// torn down by av1_grain_uninit()
+struct grain_obj_av1 {
+ // LUT objects for the offsets, grain and scaling luts
+ pl_shader_obj lut_offsets;
+ pl_shader_obj lut_grain[2];
+ pl_shader_obj lut_scaling[3];
+
+ // Previous parameters used to check reusability
+ struct pl_film_grain_data data;
+ struct pl_color_repr repr;
+ bool fg_has_y;
+ bool fg_has_u;
+ bool fg_has_v;
+
+ // Space to store the temporary arrays, reused
+ uint32_t *offsets;
+ // Float grain tables; fill_grain_lut() uploads from the start of this array
+ float grain[2][GRAIN_HEIGHT_LUT][GRAIN_WIDTH_LUT];
+ // Integer grain templates, for luma and for the chroma channel in progress
+ int16_t grain_tmp_y[GRAIN_HEIGHT][GRAIN_WIDTH];
+ int16_t grain_tmp_uv[GRAIN_HEIGHT][GRAIN_WIDTH];
+};
+
+// Destructor for `struct grain_obj_av1`: destroys every LUT object it holds
+// and then resets the whole struct to zero. `gpu` is not used.
+static void av1_grain_uninit(pl_gpu gpu, void *ptr)
+{
+    struct grain_obj_av1 *state = ptr;
+
+    pl_shader_obj_destroy(&state->lut_offsets);
+
+    pl_shader_obj *const grain_end =
+        state->lut_grain + PL_ARRAY_SIZE(state->lut_grain);
+    for (pl_shader_obj *it = state->lut_grain; it != grain_end; it++)
+        pl_shader_obj_destroy(it);
+
+    pl_shader_obj *const scaling_end =
+        state->lut_scaling + PL_ARRAY_SIZE(state->lut_scaling);
+    for (pl_shader_obj *it = state->lut_scaling; it != scaling_end; it++)
+        pl_shader_obj_destroy(it);
+
+    *state = (struct grain_obj_av1) {0};
+}
+
+// Returns true if AV1 grain would be applied to at least one of the first
+// three components of the plane described by `params`. Luma needs scaling
+// points of its own; a chroma channel needs either its own scaling points or
+// chroma_scaling_from_luma.
+bool pl_needs_fg_av1(const struct pl_film_grain_params *params)
+{
+    const struct pl_av1_grain_data *av1 = &params->data.params.av1;
+    const bool luma = av1->num_points_y > 0;
+    const bool cb = av1->num_points_uv[0] > 0 || av1->chroma_scaling_from_luma;
+    const bool cr = av1->num_points_uv[1] > 0 || av1->chroma_scaling_from_luma;
+
+    for (int comp = 0; comp < 3; comp++) {
+        switch (channel_map(comp, params)) {
+        case PL_CHANNEL_Y:
+            if (luma)
+                return true;
+            break;
+        case PL_CHANNEL_CB:
+            if (cb)
+                return true;
+            break;
+        case PL_CHANNEL_CR:
+            if (cr)
+                return true;
+            break;
+        default:
+            break;
+        }
+    }
+
+    return false;
+}
+
+// Returns true if both grain descriptions would generate identical grain
+// LUTs, so previously uploaded LUTs can be reused.
+static inline bool av1_grain_data_eq(const struct pl_film_grain_data *da,
+                                     const struct pl_film_grain_data *db)
+{
+    const struct pl_av1_grain_data *a = &da->params.av1, *b = &db->params.av1;
+
+    // Only check the fields that are relevant for grain LUT generation
+    return da->seed == db->seed &&
+           // Chroma grain only mixes in luma grain when luma scaling points
+           // are present, so the presence (not the count) matters here
+           !a->num_points_y == !b->num_points_y &&
+           a->chroma_scaling_from_luma == b->chroma_scaling_from_luma &&
+           a->scaling_shift == b->scaling_shift &&
+           a->ar_coeff_lag == b->ar_coeff_lag &&
+           a->ar_coeff_shift == b->ar_coeff_shift &&
+           a->grain_scale_shift == b->grain_scale_shift &&
+           !memcmp(a->ar_coeffs_y, b->ar_coeffs_y, sizeof(a->ar_coeffs_y)) &&
+           !memcmp(a->ar_coeffs_uv, b->ar_coeffs_uv, sizeof(a->ar_coeffs_uv));
+}
+
+// LUT fill callback: copies width * height * comps floats from the start of
+// the grain storage in the `struct grain_obj_av1` passed via `params->priv`.
+static void fill_grain_lut(void *data, const struct sh_lut_params *params)
+{
+    const struct grain_obj_av1 *state = params->priv;
+    const size_t count = params->width * params->height * params->comps;
+
+    memcpy(data, state->grain, sizeof(float) * count);
+}
+
+bool pl_shader_fg_av1(pl_shader sh, pl_shader_obj *grain_state,
+ const struct pl_film_grain_params *params)
+{
+ int sub_x = 0, sub_y = 0;
+ int tex_w = params->tex->params.w,
+ tex_h = params->tex->params.h;
+
+ if (params->luma_tex) {
+ sub_x = params->luma_tex->params.w > tex_w;
+ sub_y = params->luma_tex->params.h > tex_h;
+ }
+
+ const struct pl_av1_grain_data *data = &params->data.params.av1;
+ bool fg_has_y = data->num_points_y > 0;
+ bool fg_has_u = data->num_points_uv[0] > 0 || data->chroma_scaling_from_luma;
+ bool fg_has_v = data->num_points_uv[1] > 0 || data->chroma_scaling_from_luma;
+
+ bool tex_is_y = false, tex_is_cb = false, tex_is_cr = false;
+ for (int i = 0; i < 3; i++) {
+ switch (channel_map(i, params)) {
+ case PL_CHANNEL_Y: tex_is_y = true; break;
+ case PL_CHANNEL_CB: tex_is_cb = true; break;
+ case PL_CHANNEL_CR: tex_is_cr = true; break;
+ default: break;
+ };
+ }
+
+ if (tex_is_y && (sub_x || sub_y)) {
+ PL_WARN(sh, "pl_film_grain_params.channels includes PL_CHANNEL_Y but "
+ "plane is subsampled, this makes no sense. Continuing anyway "
+ "but output is likely incorrect.");
+ }
+
+ if (!sh_require(sh, PL_SHADER_SIG_NONE, tex_w, tex_h))
+ return false;
+
+ pl_gpu gpu = SH_GPU(sh);
+ if (!gpu) {
+ PL_ERR(sh, "AV1 film grain synthesis requires a non-NULL pl_gpu!");
+ return false;
+ }
+
+ // Disable generation for unneeded component types
+ fg_has_y &= tex_is_y;
+ fg_has_u &= tex_is_cb;
+ fg_has_v &= tex_is_cr;
+
+ int bw = BLOCK_SIZE >> sub_x;
+ int bh = BLOCK_SIZE >> sub_y;
+ bool is_compute = sh_try_compute(sh, bw, bh, false, sizeof(uint32_t));
+
+ struct grain_obj_av1 *obj;
+ obj = SH_OBJ(sh, grain_state, PL_SHADER_OBJ_AV1_GRAIN,
+ struct grain_obj_av1, av1_grain_uninit);
+ if (!obj)
+ return false;
+
+ // Note: In theory we could check only the parameters related to luma or
+ // only related to chroma and skip updating for changes to irrelevant
+ // parts, but this is probably not worth it since the seed is expected to
+ // change per frame anyway.
+ bool needs_update = !av1_grain_data_eq(&params->data, &obj->data) ||
+ !pl_color_repr_equal(params->repr, &obj->repr) ||
+ fg_has_y != obj->fg_has_y ||
+ fg_has_u != obj->fg_has_u ||
+ fg_has_v != obj->fg_has_v;
+
+ if (needs_update) {
+ // This is needed even for chroma, so statically generate it
+ generate_grain_y(obj->grain[0], obj->grain_tmp_y, params);
+ }
+
+ ident_t lut[3];
+ int idx[3] = {-1};
+
+ if (fg_has_y) {
+ lut[0] = sh_lut(sh, sh_lut_params(
+ .object = &obj->lut_grain[0],
+ .var_type = PL_VAR_FLOAT,
+ .lut_type = SH_LUT_TEXTURE,
+ .width = GRAIN_WIDTH_LUT,
+ .height = GRAIN_HEIGHT_LUT,
+ .comps = 1,
+ .update = needs_update,
+ .dynamic = true,
+ .fill = fill_grain_lut,
+ .priv = obj,
+ ));
+
+ if (!lut[0]) {
+ SH_FAIL(sh, "Failed generating/uploading luma grain LUT!");
+ return false;
+ }
+ }
+
+ // Try merging the chroma LUTs into a single texture
+ int chroma_comps = 0;
+ if (fg_has_u) {
+ generate_grain_uv(&obj->grain[chroma_comps][0][0], obj->grain_tmp_uv,
+ obj->grain_tmp_y, PL_CHANNEL_CB, sub_x, sub_y,
+ params);
+ idx[1] = chroma_comps++;
+ }
+ if (fg_has_v) {
+ generate_grain_uv(&obj->grain[chroma_comps][0][0], obj->grain_tmp_uv,
+ obj->grain_tmp_y, PL_CHANNEL_CR, sub_x, sub_y,
+ params);
+ idx[2] = chroma_comps++;
+ }
+
+ if (chroma_comps > 0) {
+ lut[1] = lut[2] = sh_lut(sh, sh_lut_params(
+ .object = &obj->lut_grain[1],
+ .var_type = PL_VAR_FLOAT,
+ .lut_type = SH_LUT_TEXTURE,
+ .width = GRAIN_WIDTH_LUT >> sub_x,
+ .height = GRAIN_HEIGHT_LUT >> sub_y,
+ .comps = chroma_comps,
+ .update = needs_update,
+ .dynamic = true,
+ .fill = fill_grain_lut,
+ .priv = obj,
+ ));
+
+ if (!lut[1]) {
+ SH_FAIL(sh, "Failed generating/uploading chroma grain LUT!");
+ return false;
+ }
+
+ if (chroma_comps == 1)
+ idx[1] = idx[2] = -1;
+ }
+
+ ident_t offsets = sh_lut(sh, sh_lut_params(
+ .object = &obj->lut_offsets,
+ .var_type = PL_VAR_UINT,
+ .lut_type = SH_LUT_AUTO,
+ .width = PL_ALIGN2(tex_w << sub_x, 128) / 32,
+ .height = PL_ALIGN2(tex_h << sub_y, 128) / 32,
+ .comps = 1,
+ .update = needs_update,
+ .dynamic = true,
+ .fill = generate_offsets,
+ .priv = (void *) &params->data,
+ ));
+
+ if (!offsets) {
+ SH_FAIL(sh, "Failed generating/uploading block offsets LUT!");
+ return false;
+ }
+
+ // For the scaling LUTs, we assume they'll be relatively constant
+ // throughout the video so doing some extra work to avoid reinitializing
+ // them constantly is probably worth it. Probably.
+ const struct pl_av1_grain_data *obj_data = &obj->data.params.av1;
+ bool scaling_changed = false;
+ if (fg_has_y || data->chroma_scaling_from_luma) {
+ scaling_changed |= data->num_points_y != obj_data->num_points_y;
+ scaling_changed |= memcmp(data->points_y, obj_data->points_y,
+ sizeof(data->points_y));
+ }
+
+ if (fg_has_u && !data->chroma_scaling_from_luma) {
+ scaling_changed |= data->num_points_uv[0] != obj_data->num_points_uv[0];
+ scaling_changed |= memcmp(data->points_uv[0],
+ obj_data->points_uv[0],
+ sizeof(data->points_uv[0]));
+ }
+
+ if (fg_has_v && !data->chroma_scaling_from_luma) {
+ scaling_changed |= data->num_points_uv[1] != obj_data->num_points_uv[1];
+ scaling_changed |= memcmp(data->points_uv[1],
+ obj_data->points_uv[1],
+ sizeof(data->points_uv[1]));
+ }
+
+ ident_t scaling[3] = {0};
+ for (int i = 0; i < 3; i++) {
+ struct {
+ int num;
+ const uint8_t (*points)[2];
+ const struct pl_av1_grain_data *data;
+ } priv;
+
+ priv.data = data;
+ if (i == 0 || data->chroma_scaling_from_luma) {
+ priv.num = data->num_points_y;
+ priv.points = &data->points_y[0];
+ } else {
+ priv.num = data->num_points_uv[i - 1];
+ priv.points = &data->points_uv[i - 1][0];
+ }
+
+ // Skip scaling for unneeded channels
+ bool has_c[3] = { fg_has_y, fg_has_u, fg_has_v };
+ if (has_c[i] && priv.num > 0) {
+ scaling[i] = sh_lut(sh, sh_lut_params(
+ .object = &obj->lut_scaling[i],
+ .var_type = PL_VAR_FLOAT,
+ .method = SH_LUT_LINEAR,
+ .width = SCALING_LUT_SIZE,
+ .comps = 1,
+ .update = scaling_changed,
+ .dynamic = true,
+ .fill = generate_scaling,
+ .priv = &priv,
+ ));
+
+ if (!scaling[i]) {
+ SH_FAIL(sh, "Failed generating/uploading scaling LUTs!");
+ return false;
+ }
+ }
+ }
+
+ // Done updating LUTs
+ obj->data = params->data;
+ obj->repr = *params->repr;
+ obj->fg_has_y = fg_has_y;
+ obj->fg_has_u = fg_has_u;
+ obj->fg_has_v = fg_has_v;
+
+ sh_describe(sh, "AV1 film grain");
+ GLSL("vec4 color; \n"
+ "// pl_shader_film_grain (AV1) \n"
+ "{ \n"
+ "uvec2 offset; \n"
+ "uvec2 pos; \n"
+ "float val; \n"
+ "float grain; \n");
+
+ if (is_compute) {
+ GLSL("uvec2 block_id = gl_WorkGroupID.xy; \n"
+ "uvec2 local_id = gl_LocalInvocationID.xy; \n"
+ "uvec2 global_id = gl_GlobalInvocationID.xy; \n");
+ } else {
+ GLSL("uvec2 global_id = uvec2(gl_FragCoord); \n"
+ "uvec2 block_id = global_id / uvec2(%d, %d); \n"
+ "uvec2 local_id = global_id - uvec2(%d, %d) * block_id; \n",
+ bw, bh, bw, bh);
+ }
+
+ // Load the data vector which holds the offsets
+ if (is_compute) {
+ ident_t id = sh_fresh(sh, "data");
+ GLSLH("shared uint "$"; \n", id);
+ GLSL("if (gl_LocalInvocationIndex == 0u) \n"
+ " "$" = uint("$"(block_id)); \n"
+ "barrier(); \n"
+ "uint data = "$"; \n",
+ id, offsets, id);
+ } else {
+ GLSL("uint data = uint("$"(block_id)); \n", offsets);
+ }
+
+ struct grain_scale scale = get_grain_scale(params);
+ pl_color_repr_normalize(params->repr);
+ int bits = PL_DEF(params->repr->bits.color_depth, 8);
+ pl_assert(bits >= 8);
+
+ ident_t minValue, maxLuma, maxChroma;
+ if (pl_color_levels_guess(params->repr) == PL_COLOR_LEVELS_LIMITED) {
+ float out_scale = (1 << bits) / ((1 << bits) - 1.0);
+ minValue = SH_FLOAT(16 / 256.0 * out_scale);
+ maxLuma = SH_FLOAT(235 / 256.0 * out_scale);
+ maxChroma = SH_FLOAT(240 / 256.0 * out_scale);
+ if (!pl_color_system_is_ycbcr_like(params->repr->sys))
+ maxChroma = maxLuma;
+ } else {
+ minValue = SH_FLOAT(0.0);
+ maxLuma = SH_FLOAT(1.0);
+ maxChroma = SH_FLOAT(1.0);
+ }
+
+ // Load the color value of the tex itself
+ ident_t tex = sh_desc(sh, (struct pl_shader_desc) {
+ .binding.object = params->tex,
+ .desc = (struct pl_desc) {
+ .name = "tex",
+ .type = PL_DESC_SAMPLED_TEX,
+ },
+ });
+
+ ident_t tex_scale = SH_FLOAT(scale.texture_scale);
+ GLSL("color = vec4("$") * texelFetch("$", ivec2(global_id), 0); \n",
+ tex_scale, tex);
+
+ // If we need access to the external luma plane, load it now
+ if (tex_is_cb || tex_is_cr) {
+ GLSL("float averageLuma; \n");
+ if (tex_is_y) {
+ // We already have the luma channel as part of the pre-sampled color
+ for (int i = 0; i < 3; i++) {
+ if (channel_map(i, params) == PL_CHANNEL_Y) {
+ GLSL("averageLuma = color["$"]; \n", SH_INT(i));
+ break;
+ }
+ }
+ } else {
+ // Luma channel not present in image, attach it separately
+ pl_assert(params->luma_tex);
+ ident_t luma = sh_desc(sh, (struct pl_shader_desc) {
+ .binding.object = params->luma_tex,
+ .desc = (struct pl_desc) {
+ .name = "luma",
+ .type = PL_DESC_SAMPLED_TEX,
+ },
+ });
+
+ GLSL("pos = global_id * uvec2(%du, %du); \n"
+ "averageLuma = texelFetch("$", ivec2(pos), 0)["$"]; \n"
+ "averageLuma *= "$"; \n",
+ 1 << sub_x, 1 << sub_y,
+ luma, SH_INT(params->luma_comp),
+ tex_scale);
+ }
+ }
+
+ ident_t grain_min = SH_FLOAT(scale.grain_min * scale.grain_scale);
+ ident_t grain_max = SH_FLOAT(scale.grain_max * scale.grain_scale);
+
+ for (int i = 0; i < params->components; i++) {
+ enum pl_channel c = channel_map(i, params);
+ if (c == PL_CHANNEL_NONE)
+ continue;
+ if (!scaling[c])
+ continue;
+
+ sample(sh, OFFSET_N, lut[c], idx[c], sub_x, sub_y);
+ GLSL("grain = val; \n");
+
+ if (data->overlap) {
+ const char *weights[] = { "vec2(27.0, 17.0)", "vec2(23.0, 22.0)" };
+
+ // X-direction overlapping
+ GLSL("if (block_id.x > 0u && local_id.x < %du) { \n"
+ "vec2 w = %s / 32.0; \n"
+ "if (local_id.x == 1u) w.xy = w.yx; \n",
+ 2 >> sub_x, weights[sub_x]);
+ sample(sh, OFFSET_L, lut[c], idx[c], sub_x, sub_y);
+ GLSL("grain = dot(vec2(val, grain), w); \n"
+ "} \n");
+
+ // Y-direction overlapping
+ GLSL("if (block_id.y > 0u && local_id.y < %du) { \n"
+ "vec2 w = %s / 32.0; \n"
+ "if (local_id.y == 1u) w.xy = w.yx; \n",
+ 2 >> sub_y, weights[sub_y]);
+
+ // We need to special-case the top left pixels since these need to
+ // pre-blend the top-left offset block before blending vertically
+ GLSL(" if (block_id.x > 0u && local_id.x < %du) {\n"
+ " vec2 w2 = %s / 32.0; \n"
+ " if (local_id.x == 1u) w2.xy = w2.yx; \n",
+ 2 >> sub_x, weights[sub_x]);
+ sample(sh, OFFSET_TL, lut[c], idx[c], sub_x, sub_y);
+ GLSL(" float tmp = val; \n");
+ sample(sh, OFFSET_T, lut[c], idx[c], sub_x, sub_y);
+ GLSL(" val = dot(vec2(tmp, val), w2); \n"
+ " } else { \n");
+ sample(sh, OFFSET_T, lut[c], idx[c], sub_x, sub_y);
+ GLSL(" } \n"
+ "grain = dot(vec2(val, grain), w); \n"
+ "} \n");
+
+ // Correctly clip the interpolated grain
+ GLSL("grain = clamp(grain, "$", "$"); \n", grain_min, grain_max);
+ }
+
+ if (c == PL_CHANNEL_Y) {
+ GLSL("color[%d] += "$"(color[%d]) * grain; \n"
+ "color[%d] = clamp(color[%d], "$", "$"); \n",
+ i, scaling[c], i,
+ i, i, minValue, maxLuma);
+ } else {
+ GLSL("val = averageLuma; \n");
+ if (!data->chroma_scaling_from_luma) {
+ // We need to load some extra variables for the mixing. Do this
+ // using sh_var instead of hard-coding them to avoid shader
+ // recompilation when these values change.
+ ident_t mult = sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_vec2("mult"),
+ .data = &(float[2]){
+ data->uv_mult_luma[c - 1] / 64.0,
+ data->uv_mult[c - 1] / 64.0,
+ },
+ });
+
+ int c_offset = (unsigned) data->uv_offset[c - 1] << (bits - 8);
+ ident_t offset = sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_float("offset"),
+ .data = &(float) { c_offset * scale.grain_scale },
+ });
+
+ GLSL("val = dot(vec2(val, color[%d]), "$"); \n"
+ "val += "$"; \n",
+ i, mult, offset);
+ }
+ GLSL("color[%d] += "$"(val) * grain; \n"
+ "color[%d] = clamp(color[%d], "$", "$"); \n",
+ i, scaling[c],
+ i, i, minValue, maxChroma);
+ }
+ }
+
+ GLSL("} \n");
+ return true;
+}
diff --git a/src/shaders/film_grain_h274.c b/src/shaders/film_grain_h274.c
new file mode 100644
index 0000000..6d524da
--- /dev/null
+++ b/src/shaders/film_grain_h274.c
@@ -0,0 +1,815 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "shaders.h"
+#include "shaders/film_grain.h"
+
+// Forward declarations of the constant lookup tables used by the grain
+// generator below; their contents are defined further down in this file.
+static const int8_t Gaussian_LUT[2048+4];
+static const uint32_t Seed_LUT[256];
+static const int8_t R64T[64][64];
+
+// Advances the PRNG state in `*state` by one step: the state is shifted left
+// by one bit and the vacated low bit is filled with 1 XOR bit 2 XOR bit 30 of
+// the previous state.
+static void prng_shift(uint32_t *state)
+{
+    // Primitive polynomial x^31 + x^3 + 1 (modulo 2)
+    const uint32_t prev = *state;
+    const uint32_t tap_lo = (prev >> 2) & 1u;
+    const uint32_t tap_hi = (prev >> 30) & 1u;
+    const uint32_t new_bit = 1u ^ tap_lo ^ tap_hi;
+    *state = (prev << 1) | new_bit;
+}
+
+
+// Generates one 64x64 grain pattern for the index pair (h, v) and writes it,
+// divided by 255, to `out`. Consecutive output rows are `out_width` floats
+// apart. `h` and `v` select the PRNG seed and the horizontal / vertical extent
+// of the randomly initialized coefficients; `v` additionally selects the
+// deblocking strength. `grain` and `tmp` are caller-provided scratch buffers
+// whose previous contents are overwritten.
+static void generate_slice(float *out, size_t out_width, uint8_t h, uint8_t v,
+                           int8_t grain[64][64], int16_t tmp[64][64])
+{
+    static const uint8_t deblock_factors[13] = {
+        64, 71, 77, 84, 90, 96, 103, 109, 116, 122, 128, 128, 128
+    };
+
+    const uint8_t freq_h = ((h + 3) << 2) - 1;
+    const uint8_t freq_v = ((v + 3) << 2) - 1;
+    uint32_t seed = Seed_LUT[h + v * 13];
+
+    // Fill the coefficients up to (freq_h, freq_v) with random gaussian
+    // values, consuming four consecutive table entries per PRNG step.
+    //
+    // Note: To make the subsequent matrix multiplication cache friendlier,
+    // each *column* of the starting image is stored in a *row* of `grain`
+    for (int row = 0; row <= freq_v; row++) {
+        for (int col = 0; col <= freq_h; col += 4) {
+            const uint16_t offset = seed % 2048;
+            for (int k = 0; k < 4; k++)
+                grain[col + k][row] = Gaussian_LUT[offset + k];
+            prng_shift(&seed);
+        }
+    }
+
+    grain[0][0] = 0;
+
+    // 64x64 inverse integer transform, first pass: grain -> tmp
+    for (int y = 0; y < 64; y++) {
+        for (int x = 0; x <= freq_h; x++) {
+            int32_t acc = 0;
+            for (int p = 0; p <= freq_v; p++)
+                acc += R64T[y][p] * grain[x][p];
+            tmp[y][x] = (acc + 128) >> 8;
+        }
+    }
+
+    // Second pass: tmp -> grain, clamped to the symmetric int8 range
+    for (int y = 0; y < 64; y++) {
+        for (int x = 0; x < 64; x++) {
+            int32_t acc = 0;
+            for (int p = 0; p <= freq_h; p++)
+                acc += tmp[y][p] * R64T[x][p]; // R64T^T = R64
+            acc = (acc + 128) >> 8;
+            grain[y][x] = PL_CLAMP(acc, -127, 127);
+        }
+    }
+
+    // Deblock horizontal edges by simple attenuation of values: only the
+    // first and last row of every group of 8 rows are scaled down
+    const uint8_t deblock_coeff = deblock_factors[v];
+    float *dst = out;
+    for (int y = 0; y < 64; y++, dst += out_width) {
+        const int phase = y % 8;
+        const bool on_edge = phase == 0 || phase == 7;
+        for (int x = 0; x < 64; x++) {
+            if (on_edge) {
+                dst[x] = ((grain[y][x] * deblock_coeff) >> 7) / 255.0;
+            } else {
+                dst[x] = grain[y][x] / 255.0;
+            }
+        }
+    }
+}
+
+// LUT fill callback: populates `data` (treated as a float array with
+// `params->width` floats per row) with a 13x13 grid of 64x64 grain patterns,
+// one per (h, v) pair. The pattern for (h, v) starts at row `h * 64`,
+// column `v * 64`.
+static void fill_grain_lut(void *data, const struct sh_lut_params *params)
+{
+    // Scratch space shared by all slices, allocated once up front
+    struct {
+        int8_t grain[64][64];
+        int16_t tmp[64][64];
+    } *scratch = pl_alloc_ptr(NULL, scratch);
+
+    float *out = data;
+    assert(params->var_type == PL_VAR_FLOAT);
+
+    // Walk the grid in h-major order
+    for (int n = 0; n < 13 * 13; n++) {
+        const int h = n / 13;
+        const int v = n % 13;
+        float *slice = out + (h * 64) * params->width + (v * 64);
+        generate_slice(slice, params->width, h, v, scratch->grain, scratch->tmp);
+    }
+
+    pl_free(scratch);
+}
+
+// Returns whether H.274 film grain needs to be applied for `params`: true
+// only if the grain data uses model ID 0 and at least one of the first three
+// components maps to a channel whose component model is marked as present.
+bool pl_needs_fg_h274(const struct pl_film_grain_params *params)
+{
+    const struct pl_h274_grain_data *data = &params->data.params.h274;
+    if (data->model_id != 0)
+        return false;
+
+    bool needed = false;
+    for (int i = 0; i < 3 && !needed; i++) {
+        const enum pl_channel ch = channel_map(i, params);
+        // Only channels 0..2 have a component model entry
+        if (ch >= 0 && ch < 3 && data->component_model_present[ch])
+            needed = true;
+    }
+
+    return needed;
+}
+
+// Emits the H.274 film grain shader for `params->tex` into `sh`. The
+// generated GLSL declares `vec4 color`, loads it from the texture and adds
+// grain to every component whose mapped channel has a component model
+// present. `grain_state` holds the grain pattern LUT object. Returns false
+// (after flagging the shader as failed where applicable) if the shader
+// requirements are not met or compute shaders are unavailable.
+//
+// Note: inside the generated code, `color[]` is indexed by the texture
+// component index, whereas the grain parameters and the per-channel random
+// value are indexed by the logical channel returned by channel_map().
+bool pl_shader_fg_h274(pl_shader sh, pl_shader_obj *grain_state,
+                       const struct pl_film_grain_params *params)
+{
+    if (!sh_require(sh, PL_SHADER_SIG_NONE, params->tex->params.w, params->tex->params.h))
+        return false;
+
+    size_t shmem_req = 0;
+    ident_t group_sum = NULL_IDENT;
+
+    // If a single subgroup cannot cover the whole 8x8 work group, the block
+    // average is accumulated through a shared integer instead
+    const struct pl_glsl_version glsl = sh_glsl(sh);
+    const bool use_group_sum = glsl.subgroup_size < 8*8;
+    if (use_group_sum) {
+        group_sum = sh_fresh(sh, "group_sum");
+        shmem_req += sizeof(int);
+        GLSLH("shared int "$"; \n", group_sum);
+        GLSL($" = 0; barrier(); \n", group_sum);
+    }
+
+    if (!sh_try_compute(sh, 8, 8, false, shmem_req)) {
+        SH_FAIL(sh, "H.274 film grain synthesis requires compute shaders!");
+        return false;
+    }
+
+    ident_t db = sh_lut(sh, sh_lut_params(
+        .object = grain_state,
+        .var_type = PL_VAR_FLOAT,
+        .lut_type = SH_LUT_TEXTURE,
+        .width = 13 * 64,
+        .height = 13 * 64,
+        .comps = 1,
+        .fill = fill_grain_lut,
+        .signature = CACHE_KEY_H274, // doesn't depend on anything
+        .cache = SH_CACHE(sh),
+    ));
+
+    sh_describe(sh, "H.274 film grain");
+    GLSL("vec4 color; \n"
+         "// pl_shader_film_grain (H.274) \n"
+         "{ \n");
+
+    // Load the color value of the tex itself
+    ident_t tex = sh_desc(sh, (struct pl_shader_desc) {
+        .binding.object = params->tex,
+        .desc = (struct pl_desc) {
+            .name = "tex",
+            .type = PL_DESC_SAMPLED_TEX,
+        },
+    });
+
+    GLSL("ivec2 pos = ivec2(gl_GlobalInvocationID); \n"
+         "color = vec4("$") * texelFetch("$", pos, 0); \n",
+         SH_FLOAT(pl_color_repr_normalize(params->repr)), tex);
+
+    const struct pl_h274_grain_data *data = &params->data.params.h274;
+    ident_t scale_factor = sh_var(sh, (struct pl_shader_var) {
+        .var = pl_var_float("scale_factor"),
+        .data = &(float){ 1.0 / (1 << (data->log2_scale_factor + 6)) },
+    });
+
+    // pcg3d (http://www.jcgt.org/published/0009/03/02/)
+    GLSL("uvec3 pcg = uvec3("$", gl_WorkGroupID.xy / 2u); \n"
+         "pcg = pcg * 1664525u + 1013904223u; \n"
+         "pcg.x += pcg.y * pcg.z; \n"
+         "pcg.y += pcg.z * pcg.x; \n"
+         "pcg.z += pcg.x * pcg.y; \n"
+         "pcg ^= pcg >> 16u; \n"
+         "pcg.x += pcg.y * pcg.z; \n"
+         "pcg.y += pcg.z * pcg.x; \n"
+         "pcg.z += pcg.x * pcg.y; \n",
+         sh_var(sh, (struct pl_shader_var) {
+             .var = pl_var_uint("seed"),
+             .data = &(unsigned int){ params->data.seed },
+         }));
+
+    // Tracks whether an earlier component already accumulated into group_sum
+    bool group_sum_dirty = false;
+
+    for (int idx = 0; idx < params->components; idx++) {
+        enum pl_channel c = channel_map(idx, params);
+        if (c == PL_CHANNEL_NONE)
+            continue;
+        if (!data->component_model_present[c])
+            continue;
+
+        GLSL("// component %d\n{\n", c);
+
+        // The shared accumulator still contains the previous component's
+        // sum, so it must be cleared before being reused. The leading
+        // barrier makes sure every invocation has finished reading the old
+        // value first.
+        if (use_group_sum && group_sum_dirty)
+            GLSL("barrier(); "$" = 0; barrier(); \n", group_sum);
+        group_sum_dirty = use_group_sum;
+
+        // Compute the local 8x8 average. `color` is indexed by the texture
+        // component (idx), not by the logical channel (c)
+        GLSL("float avg = color[%d] / 64.0; \n", idx);
+
+        const int precision = 10000000;
+        if (glsl.subgroup_size) {
+            GLSL("avg = subgroupAdd(avg); \n");
+
+            if (glsl.subgroup_size < 8*8) {
+                GLSL("if (subgroupElect()) \n"
+                     " atomicAdd("$", int(avg * %d.0)); \n"
+                     "barrier(); \n"
+                     "avg = float("$") / %d.0; \n",
+                     group_sum, precision, group_sum, precision);
+            }
+        } else {
+            GLSL("atomicAdd("$", int(avg * %d.0)); \n"
+                 "barrier(); \n"
+                 "avg = float("$") / %d.0; \n",
+                 group_sum, precision, group_sum, precision);
+        }
+
+        // Hard-coded unrolled loop, to avoid having to load a dynamically
+        // sized array into the shader - and to optimize for the very common
+        // case of there only being a single intensity interval
+        GLSL("uint val; \n");
+        for (int i = 0; i < data->num_intensity_intervals[c]; i++) {
+            ident_t bounds = sh_var(sh, (struct pl_shader_var) {
+                .var = pl_var_vec2("bounds"),
+                .data = &(float[2]) {
+                    data->intensity_interval_lower_bound[c][i] / 255.0,
+                    data->intensity_interval_upper_bound[c][i] / 255.0,
+                },
+            });
+
+            const uint8_t num_values = data->num_model_values[c];
+            uint8_t h = num_values > 1 ? data->comp_model_value[c][i][1] : 8;
+            uint8_t v = num_values > 2 ? data->comp_model_value[c][i][2] : h;
+            h = PL_CLAMP(h, 2, 14) - 2;
+            v = PL_CLAMP(v, 2, 14) - 2;
+            // FIXME: double h/v for subsampled planes!
+
+            // Reduce scale for chroma planes
+            int16_t scale = data->comp_model_value[c][i][0];
+            if (c > 0 && pl_color_system_is_ycbcr_like(params->repr->sys))
+                scale >>= 1;
+
+            // Pack scale (bits 16..31), h (bits 8..15) and v (bits 0..7).
+            // The shift is done on an unsigned value, since shifting the
+            // int-promoted uint16_t would overflow when bit 15 is set
+            pl_static_assert(sizeof(unsigned int) >= sizeof(uint32_t));
+            ident_t values = sh_var(sh, (struct pl_shader_var) {
+                .var = pl_var_uint("comp_model_value"),
+                .data = &(unsigned int) {
+                    ((unsigned int) (uint16_t) scale << 16) | h << 8 | v,
+                },
+            });
+
+            GLSL("if (avg >= "$".x && avg <= "$".y) \n"
+                 " val = "$"; else \n",
+                 bounds, bounds, values);
+        }
+        // Final `else` branch of the chain: no interval matched
+        GLSL(" val = 0u; \n");
+
+        // Extract the grain parameters from comp_model_value
+        GLSL("uvec2 offset = uvec2((val & 0xFF00u) >> 2, \n"
+             " (val & 0xFFu) << 6); \n"
+             "float scale = "$" * float(int(val >> 16)); \n"
+             // Add randomness
+             "uint rand = pcg[%d]; \n"
+             "offset.x += (rand >> 16u) %% 52u; \n"
+             "offset.y += (rand & 0xFFFFu) %% 56u; \n"
+             "offset.x &= 0xFFFCu; \n"
+             "offset.y &= 0xFFF8u; \n"
+             "if ((rand & 1u) == 1u) scale = -scale; \n"
+             // Add local offset and compute grain
+             "offset += 8u * (gl_WorkGroupID.xy %% 2u); \n"
+             "offset += gl_LocalInvocationID.xy; \n"
+             "float grain = "$"(offset); \n"
+             "color[%d] += scale * grain; \n",
+             scale_factor, c, db, idx);
+
+        // TODO: Deblocking?
+
+        GLSL("}\n");
+    }
+
+    GLSL("} \n");
+    return true;
+}
+
+// These tables are all taken from the SMPTE RDD 5-2006 specification
+static const int8_t Gaussian_LUT[2048+4] = {
+ -11, 12, 103, -11, 42, -35, 12, 59, 77, 98, -87, 3, 65, -78, 45, 56, -51, 21,
+ 13, -11, -20, -19, 33, -127, 17, -6, -105, 18, 19, 71, 48, -10, -38, 42,
+ -2, 75, -67, 52, -90, 33, -47, 21, -3, -56, 49, 1, -57, -42, -1, 120, -127,
+ -108, -49, 9, 14, 127, 122, 109, 52, 127, 2, 7, 114, 19, 30, 12, 77, 112,
+ 82, -61, -127, 111, -52, -29, 2, -49, -24, 58, -29, -73, 12, 112, 67, 79,
+ -3, -114, -87, -6, -5, 40, 58, -81, 49, -27, -31, -34, -105, 50, 16, -24,
+ -35, -14, -15, -127, -55, -22, -55, -127, -112, 5, -26, -72, 127, 127, -2,
+ 41, 87, -65, -16, 55, 19, 91, -81, -65, -64, 35, -7, -54, 99, -7, 88, 125,
+ -26, 91, 0, 63, 60, -14, -23, 113, -33, 116, 14, 26, 51, -16, 107, -8, 53,
+ 38, -34, 17, -7, 4, -91, 6, 63, 63, -15, 39, -36, 19, 55, 17, -51, 40, 33,
+ -37, 126, -39, -118, 17, -30, 0, 19, 98, 60, 101, -12, -73, -17, -52, 98,
+ 3, 3, 60, 33, -3, -2, 10, -42, -106, -38, 14, 127, 16, -127, -31, -86, -39,
+ -56, 46, -41, 75, 23, -19, -22, -70, 74, -54, -2, 32, -45, 17, -92, 59,
+ -64, -67, 56, -102, -29, -87, -34, -92, 68, 5, -74, -61, 93, -43, 14, -26,
+ -38, -126, -17, 16, -127, 64, 34, 31, 93, 17, -51, -59, 71, 77, 81, 127,
+ 127, 61, 33, -106, -93, 0, 0, 75, -69, 71, 127, -19, -111, 30, 23, 15, 2,
+ 39, 92, 5, 42, 2, -6, 38, 15, 114, -30, -37, 50, 44, 106, 27, 119, 7, -80,
+ 25, -68, -21, 92, -11, -1, 18, 41, -50, 79, -127, -43, 127, 18, 11, -21,
+ 32, -52, 27, -88, -90, -39, -19, -10, 24, -118, 72, -24, -44, 2, 12, 86,
+ -107, 39, -33, -127, 47, 51, -24, -22, 46, 0, 15, -35, -69, -2, -74, 24,
+ -6, 0, 29, -3, 45, 32, -32, 117, -45, 79, -24, -17, -109, -10, -70, 88,
+ -48, 24, -91, 120, -37, 50, -127, 58, 32, -82, -10, -17, -7, 46, -127, -15,
+ 89, 127, 17, 98, -39, -33, 37, 42, -40, -32, -21, 105, -19, 19, 19, -59,
+ -9, 30, 0, -127, 34, 127, -84, 75, 24, -40, -49, -127, -107, -14, 45, -75,
+ 1, 30, -20, 41, -68, -40, 12, 127, -3, 5, 20, -73, -59, -127, -3, -3, -53,
+ -6, -119, 93, 120, -80, -50, 0, 20, -46, 67, 78, -12, -22, -127, 36, -41,
+ 56, 119, -5, -116, -22, 68, -14, -90, 24, -82, -44, -127, 107, -25, -37,
+ 40, -7, -7, -82, 5, -87, 44, -34, 9, -127, 39, 70, 49, -63, 74, -49, 109,
+ -27, -89, -47, -39, 44, 49, -4, 60, -42, 80, 9, -127, -9, -56, -49, 125,
+ -66, 47, 36, 117, 15, -11, -96, 109, 94, -17, -56, 70, 8, -14, -5, 50, 37,
+ -45, 120, -30, -76, 40, -46, 6, 3, 69, 17, -78, 1, -79, 6, 127, 43, 26,
+ 127, -127, 28, -55, -26, 55, 112, 48, 107, -1, -77, -1, 53, -9, -22, -43,
+ 123, 108, 127, 102, 68, 46, 5, 1, 123, -13, -55, -34, -49, 89, 65, -105,
+ -5, 94, -53, 62, 45, 30, 46, 18, -35, 15, 41, 47, -98, -24, 94, -75, 127,
+ -114, 127, -68, 1, -17, 51, -95, 47, 12, 34, -45, -75, 89, -107, -9, -58,
+ -29, -109, -24, 127, -61, -13, 77, -45, 17, 19, 83, -24, 9, 127, -66, 54,
+ 4, 26, 13, 111, 43, -113, -22, 10, -24, 83, 67, -14, 75, -123, 59, 127,
+ -12, 99, -19, 64, -38, 54, 9, 7, 61, -56, 3, -57, 113, -104, -59, 3, -9,
+ -47, 74, 85, -55, -34, 12, 118, 28, 93, -72, 13, -99, -72, -20, 30, 72,
+ -94, 19, -54, 64, -12, -63, -25, 65, 72, -10, 127, 0, -127, 103, -20, -73,
+ -112, -103, -6, 28, -42, -21, -59, -29, -26, 19, -4, -51, 94, -58, -95,
+ -37, 35, 20, -69, 127, -19, -127, -22, -120, -53, 37, 74, -127, -1, -12,
+ -119, -53, -28, 38, 69, 17, 16, -114, 89, 62, 24, 37, -23, 49, -101, -32,
+ -9, -95, -53, 5, 93, -23, -49, -8, 51, 3, -75, -90, -10, -39, 127, -86,
+ -22, 20, 20, 113, 75, 52, -31, 92, -63, 7, -12, 46, 36, 101, -43, -17, -53,
+ -7, -38, -76, -31, -21, 62, 31, 62, 20, -127, 31, 64, 36, 102, -85, -10,
+ 77, 80, 58, -79, -8, 35, 8, 80, -24, -9, 3, -17, 72, 127, 83, -87, 55, 18,
+ -119, -123, 36, 10, 127, 56, -55, 113, 13, 26, 32, -13, -48, 22, -13, 5,
+ 58, 27, 24, 26, -11, -36, 37, -92, 78, 81, 9, 51, 14, 67, -13, 0, 32, 45,
+ -76, 32, -39, -22, -49, -127, -27, 31, -9, 36, 14, 71, 13, 57, 12, -53,
+ -86, 53, -44, -35, 2, 127, 12, -66, -44, 46, -115, 3, 10, 56, -35, 119,
+ -19, -61, 52, -59, -127, -49, -23, 4, -5, 17, -82, -6, 127, 25, 79, 67, 64,
+ -25, 14, -64, -37, -127, -28, 21, -63, 66, -53, -41, 109, -62, 15, -22, 13,
+ 29, -63, 20, 27, 95, -44, -59, -116, -10, 79, -49, 22, -43, -16, 46, -47,
+ -120, -36, -29, -52, -44, 29, 127, -13, 49, -9, -127, 75, -28, -23, 88, 59,
+ 11, -95, 81, -59, 58, 60, -26, 40, -92, -3, -22, -58, -45, -59, -22, -53,
+ 71, -29, 66, -32, -23, 14, -17, -66, -24, -28, -62, 47, 38, 17, 16, -37,
+ -24, -11, 8, -27, -19, 59, 45, -49, -47, -4, -22, -81, 30, -67, -127, 74,
+ 102, 5, -18, 98, 34, -66, 42, -52, 7, -59, 24, -58, -19, -24, -118, -73,
+ 91, 15, -16, 79, -32, -79, -127, -36, 41, 77, -83, 2, 56, 22, -75, 127,
+ -16, -21, 12, 31, 56, -113, -127, 90, 55, 61, 12, 55, -14, -113, -14, 32,
+ 49, -67, -17, 91, -10, 1, 21, 69, -70, 99, -19, -112, 66, -90, -10, -9,
+ -71, 127, 50, -81, -49, 24, 61, -61, -111, 7, -41, 127, 88, -66, 108, -127,
+ -6, 36, -14, 41, -50, 14, 14, 73, -101, -28, 77, 127, -8, -100, 88, 38,
+ 121, 88, -125, -60, 13, -94, -115, 20, -67, -87, -94, -119, 44, -28, -30,
+ 18, 5, -53, -61, 20, -43, 11, -77, -60, 13, 29, 3, 6, -72, 38, -60, -11,
+ 108, -53, 41, 66, -12, -127, -127, -49, 24, 29, 46, 36, 91, 34, -33, 116,
+ -51, -34, -52, 91, 7, -83, 73, -26, -103, 24, -10, 76, 84, 5, 68, -80, -13,
+ -17, -32, -48, 20, 50, 26, 10, 63, -104, -14, 37, 127, 114, 97, 35, 1, -33,
+ -55, 127, -124, -33, 61, -7, 119, -32, -127, -53, -42, 63, 3, -5, -26, 70,
+ -58, -33, -44, -43, 34, -56, -127, 127, 25, -35, -11, 16, -81, 29, -58, 40,
+ -127, -127, 20, -47, -11, -36, -63, -52, -32, -82, 78, -76, -73, 8, 27,
+ -72, -9, -74, -85, -86, -57, 25, 78, -10, -97, 35, -65, 8, -59, 14, 1, -42,
+ 32, -88, -44, 17, -3, -9, 59, 40, 12, -108, -40, 24, 34, 18, -28, 2, 51,
+ -110, -4, 100, 1, 65, 22, 0, 127, 61, 45, 25, -31, 6, 9, -7, -48, 99, 16,
+ 44, -2, -40, 32, -39, -52, 10, -110, -19, 56, -127, 69, 26, 51, 92, 40, 61,
+ -52, 45, -38, 13, 85, 122, 27, 66, 45, -111, -83, -3, 31, 37, 19, -36, 58,
+ 71, 39, -78, -47, 58, -78, 8, -62, -36, -14, 61, 42, -127, 71, -4, 24, -54,
+ 52, -127, 67, -4, -42, 30, -63, 59, -3, -1, -18, -46, -92, -81, -96, -14,
+ -53, -10, -11, -77, 13, 1, 8, -67, -127, 127, -28, 26, -14, 18, -13, -26,
+ 2, 10, -46, -32, -15, 27, -31, -59, 59, 77, -121, 28, 40, -54, -62, -31,
+ -21, -37, -32, -6, -127, -25, -60, 70, -127, 112, -127, 127, 88, -7, 116,
+ 110, 53, 87, -127, 3, 16, 23, 74, -106, -51, 3, 74, -82, -112, -74, 65, 81,
+ 25, 53, 127, -45, -50, -103, -41, -65, -29, 79, -67, 64, -33, -30, -8, 127,
+ 0, -13, -51, 67, -14, 5, -92, 29, -35, -8, -90, -57, -3, 36, 43, 44, -31,
+ -69, -7, 36, 39, -51, 43, -81, 58, 6, 127, 12, 57, 66, 46, 59, -43, -42,
+ 41, -15, -120, 24, 3, -11, 19, -13, 51, 28, 3, 55, -48, -12, -1, 2, 97,
+ -19, 29, 42, 13, 43, 78, -44, 56, -108, -43, -19, 127, 15, -11, -18, -81,
+ 83, -37, 77, -109, 15, 65, -50, 43, 12, 13, 27, 28, 61, 57, 30, 26, 106,
+ -18, 56, 13, 97, 4, -8, -62, -103, 94, 108, -44, 52, 27, -47, -9, 105, -53,
+ 46, 89, 103, -33, 38, -34, 55, 51, 70, -94, -35, -87, -107, -19, -31, 9,
+ -19, 79, -14, 77, 5, -19, -107, 85, 21, -45, -39, -42, 9, -29, 74, 47, -75,
+ 60, -127, 120, -112, -57, -32, 41, 7, 79, 76, 66, 57, 41, -25, 31, 37, -47,
+ -36, 43, -73, -37, 63, 127, -69, -52, 90, -33, -61, 60, -55, 44, 15, 4,
+ -67, 13, -92, 64, 29, -39, -3, 83, -2, -38, -85, -86, 58, 35, -69, -61, 29,
+ -37, -95, -78, 4, 30, -4, -32, -80, -22, -9, -77, 46, 7, -93, -71, 65, 9,
+ -50, 127, -70, 26, -12, -39, -114, 63, -127, -100, 4, -32, 111, 22, -60,
+ 65, -101, 26, -42, 21, -59, -27, -74, 2, -94, 6, 126, 5, 76, -88, -9, -43,
+ -101, 127, 1, 125, 92, -63, 52, 56, 4, 81, -127, 127, 80, 127, -29, 30,
+ 116, -74, -17, -57, 105, 48, 45, 25, -72, 48, -38, -108, 31, -34, 4, -11,
+ 41, -127, 52, -104, -43, -37, 52, 2, 47, 87, -9, 77, 27, -41, -25, 90, 86,
+ -56, 75, 10, 33, 78, 58, 127, 127, -7, -73, 49, -33, -106, -35, 38, 57, 53,
+ -17, -4, 83, 52, -108, 54, -125, 28, 23, 56, -43, -88, -17, -6, 47, 23, -9,
+ 0, -13, 111, 75, 27, -52, -38, -34, 39, 30, 66, 39, 38, -64, 38, 3, 21,
+ -32, -51, -28, 54, -38, -87, 20, 52, 115, 18, -81, -70, 0, -14, -46, -46,
+ -3, 125, 16, -14, 23, -82, -84, -69, -20, -65, -127, 9, 81, -49, 61, 7,
+ -36, -45, -42, 57, -26, 47, 20, -85, 46, -13, 41, -37, -75, -60, 86, -78,
+ -127, 12, 50, 2, -3, 13, 47, 5, 19, -78, -55, -27, 65, -71, 12, -108, 20,
+ -16, 11, -31, 63, -55, 37, 75, -17, 127, -73, -33, -28, -120, 105, 68, 106,
+ -103, -106, 71, 61, 2, 23, -3, 33, -5, -15, -67, -15, -23, -54, 15, -63,
+ 76, 58, -110, 1, 83, -27, 22, 75, -39, -17, -11, 64, -17, -127, -54, -66,
+ 31, 96, 116, 3, -114, -7, -108, -63, 97, 9, 50, 8, 75, -28, 72, 112, -36,
+ -112, 95, -50, 23, -13, -19, 55, 21, 23, 92, 91, 22, -49, 16, -75, 23, 9,
+ -49, -97, -37, 49, -36, 36, -127, -86, 43, 127, -24, -24, 84, 83, -35, -34,
+ -12, 109, 102, -38, 51, -68, 34, 19, -22, 49, -32, 127, 40, 24, -93, -4,
+ -3, 105, 3, -58, -18, 8, 127, -18, 125, 68, 69, -62, 30, -36, 54, -57, -24,
+ 17, 43, -36, -27, -57, -67, -21, -10, -49, 68, 12, 65, 4, 48, 55, 127, -75,
+ 44, 89, -66, -13, -78, -82, -91, 22, 30, 33, -40, -87, -34, 96, -91, 39,
+ 10, -64, -3, -12, 127, -50, -37, -56, 23, -35, -36, -54, 90, -91, 2, 50,
+ 77, -6, -127, 16, 46, -5, -73, 0, -56, -18, -72, 28, 93, 60, 49, 20, 18,
+ 111, -111, 32, -83, 47, 47, -10, 35, -88, 43, 57, -98, 127, -17, 0, 1, -39,
+ -127, -2, 0, 63, 93, 0, 36, -66, -61, -19, 39, -127, 58, 50, -17, 127, 88,
+ -43, -108, -51, -16, 7, -36, 68, 46, -14, 107, 40, 57, 7, 19, 8, 3, 88,
+ -90, -92, -18, -21, -24, 13, 7, -4, -78, -91, -4, 8, -35, -5, 19, 2, -111,
+ 4, -66, -81, 122, -20, -34, -37, -84, 127, 68, 46, 17, 47,
+
+ // Repeat the beginning of the array to allow wrapping reads
+ -11, 12, 103, -11,
+};
+
+static const uint32_t Seed_LUT[256] = {
+ 747538460, 1088979410, 1744950180, 1767011913, 1403382928,
+ 521866116, 1060417601, 2110622736, 1557184770, 105289385, 585624216,
+ 1827676546, 1191843873, 1018104344, 1123590530, 663361569, 2023850500,
+ 76561770, 1226763489, 80325252, 1992581442, 502705249, 740409860,
+ 516219202, 557974537, 1883843076, 720112066, 1640137737, 1820967556,
+ 40667586, 155354121, 1820967557, 1115949072, 1631803309, 98284748,
+ 287433856, 2119719977, 988742797, 1827432592, 579378475, 1017745956,
+ 1309377032, 1316535465, 2074315269, 1923385360, 209722667, 1546228260,
+ 168102420, 135274561, 355958469, 248291472, 2127839491, 146920100,
+ 585982612, 1611702337, 696506029, 1386498192, 1258072451, 1212240548,
+ 1043171860, 1217404993, 1090770605, 1386498193, 169093201, 541098240,
+ 1468005469, 456510673, 1578687785, 1838217424, 2010752065, 2089828354,
+ 1362717428, 970073673, 854129835, 714793201, 1266069081, 1047060864,
+ 1991471829, 1098097741, 913883585, 1669598224, 1337918685, 1219264706,
+ 1799741108, 1834116681, 683417731, 1120274457, 1073098457, 1648396544,
+ 176642749, 31171789, 718317889, 1266977808, 1400892508, 549749008,
+ 1808010512, 67112961, 1005669825, 903663673, 1771104465, 1277749632,
+ 1229754427, 950632997, 1979371465, 2074373264, 305357524, 1049387408,
+ 1171033360, 1686114305, 2147468765, 1941195985, 117709841, 809550080,
+ 991480851, 1816248997, 1561503561, 329575568, 780651196, 1659144592,
+ 1910793616, 604016641, 1665084765, 1530186961, 1870928913, 809550081,
+ 2079346113, 71307521, 876663040, 1073807360, 832356664, 1573927377,
+ 204073344, 2026918147, 1702476788, 2043881033, 57949587, 2001393952,
+ 1197426649, 1186508931, 332056865, 950043140, 890043474, 349099312,
+ 148914948, 236204097, 2022643605, 1441981517, 498130129, 1443421481,
+ 924216797, 1817491777, 1913146664, 1411989632, 929068432, 495735097,
+ 1684636033, 1284520017, 432816184, 1344884865, 210843729, 676364544,
+ 234449232, 12112337, 1350619139, 1753272996, 2037118872, 1408560528,
+ 533334916, 1043640385, 357326099, 201376421, 110375493, 541106497,
+ 416159637, 242512193, 777294080, 1614872576, 1535546636, 870600145,
+ 910810409, 1821440209, 1605432464, 1145147393, 951695441, 1758494976,
+ 1506656568, 1557150160, 608221521, 1073840384, 217672017, 684818688,
+ 1750138880, 16777217, 677990609, 953274371, 1770050213, 1359128393,
+ 1797602707, 1984616737, 1865815816, 2120835200, 2051677060, 1772234061,
+ 1579794881, 1652821009, 1742099468, 1887260865, 46468113, 1011925248,
+ 1134107920, 881643832, 1354774993, 472508800, 1892499769, 1752793472,
+ 1962502272, 687898625, 883538000, 1354355153, 1761673473, 944820481,
+ 2020102353, 22020353, 961597696, 1342242816, 964808962, 1355809701,
+ 17016649, 1386540177, 647682692, 1849012289, 751668241, 1557184768,
+ 127374604, 1927564752, 1045744913, 1614921984, 43588881, 1016185088,
+ 1544617984, 1090519041, 136122424, 215038417, 1563027841, 2026918145,
+ 1688778833, 701530369, 1372639488, 1342242817, 2036945104, 953274369,
+ 1750192384, 16842753, 964808960, 1359020032, 1358954497
+};
+
+// Note: This is pre-transposed, i.e. stored in column-major order
+static const int8_t R64T[64][64] = {
+ {
+ 32, 45, 45, 45, 45, 45, 45, 45, 44, 44, 44, 44, 43, 43, 43, 42,
+ 42, 41, 41, 40, 40, 39, 39, 38, 38, 37, 36, 36, 35, 34, 34, 33,
+ 32, 31, 30, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18,
+ 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 4, 3, 2, 1,
+ }, {
+ 32, 45, 45, 44, 43, 42, 41, 39, 38, 36, 34, 31, 29, 26, 23, 20,
+ 17, 14, 11, 8, 4, 1, -2, -6, -9, -12, -15, -18, -21, -24, -27, -30,
+ -32, -34, -36, -38, -40, -41, -43, -44, -44, -45, -45, -45, -45, -45, -44, -43,
+ -42, -40, -39, -37, -35, -33, -30, -28, -25, -22, -19, -16, -13, -10, -7, -3,
+ }, {
+ 32, 45, 44, 42, 40, 37, 34, 30, 25, 20, 15, 10, 4, -1, -7, -12,
+ -17, -22, -27, -31, -35, -38, -41, -43, -44, -45, -45, -45, -43, -41, -39, -36,
+ -32, -28, -23, -18, -13, -8, -2, 3, 9, 14, 19, 24, 29, 33, 36, 39,
+ 42, 44, 45, 45, 45, 44, 43, 40, 38, 34, 30, 26, 21, 16, 11, 6,
+ }, {
+ 32, 45, 43, 39, 35, 30, 23, 16, 9, 1, -7, -14, -21, -28, -34, -38,
+ -42, -44, -45, -45, -43, -40, -36, -31, -25, -18, -11, -3, 4, 12, 19, 26,
+ 32, 37, 41, 44, 45, 45, 44, 41, 38, 33, 27, 20, 13, 6, -2, -10,
+ -17, -24, -30, -36, -40, -43, -45, -45, -44, -42, -39, -34, -29, -22, -15, -8,
+ }, {
+ 32, 44, 41, 36, 29, 20, 11, 1, -9, -18, -27, -34, -40, -44, -45, -45,
+ -42, -37, -30, -22, -13, -3, 7, 16, 25, 33, 39, 43, 45, 45, 43, 38,
+ 32, 24, 15, 6, -4, -14, -23, -31, -38, -42, -45, -45, -43, -39, -34, -26,
+ -17, -8, 2, 12, 21, 30, 36, 41, 44, 45, 44, 40, 35, 28, 19, 10,
+ }, {
+ 32, 44, 39, 31, 21, 10, -2, -14, -25, -34, -41, -45, -45, -42, -36, -28,
+ -17, -6, 7, 18, 29, 37, 43, 45, 44, 40, 34, 24, 13, 1, -11, -22,
+ -32, -39, -44, -45, -43, -38, -30, -20, -9, 3, 15, 26, 35, 41, 45, 45,
+ 42, 36, 27, 16, 4, -8, -19, -30, -38, -43, -45, -44, -40, -33, -23, -12,
+ }, {
+ 32, 43, 36, 26, 13, -1, -15, -28, -38, -44, -45, -42, -35, -24, -11, 3,
+ 17, 30, 39, 44, 45, 41, 34, 22, 9, -6, -19, -31, -40, -45, -45, -40,
+ -32, -20, -7, 8, 21, 33, 41, 45, 44, 39, 30, 18, 4, -10, -23, -34,
+ -42, -45, -44, -38, -29, -16, -2, 12, 25, 36, 43, 45, 43, 37, 27, 14,
+ }, {
+ 32, 42, 34, 20, 4, -12, -27, -38, -44, -45, -39, -28, -13, 3, 19, 33,
+ 42, 45, 43, 34, 21, 6, -11, -26, -38, -44, -45, -39, -29, -14, 2, 18,
+ 32, 41, 45, 43, 35, 22, 7, -10, -25, -37, -44, -45, -40, -30, -15, 1,
+ 17, 31, 41, 45, 43, 36, 23, 8, -9, -24, -36, -44, -45, -40, -30, -16,
+ }, {
+ 32, 41, 30, 14, -4, -22, -36, -44, -44, -37, -23, -6, 13, 30, 41, 45,
+ 42, 31, 15, -3, -21, -36, -44, -45, -38, -24, -7, 12, 29, 40, 45, 42,
+ 32, 16, -2, -20, -35, -44, -45, -38, -25, -8, 11, 28, 40, 45, 43, 33,
+ 17, -1, -19, -34, -43, -45, -39, -26, -9, 10, 27, 39, 45, 43, 34, 18,
+ }, {
+ 32, 40, 27, 8, -13, -31, -43, -45, -38, -22, -2, 18, 35, 44, 44, 34,
+ 17, -3, -23, -38, -45, -42, -30, -12, 9, 28, 41, 45, 40, 26, 7, -14,
+ -32, -43, -45, -37, -21, -1, 19, 36, 44, 44, 34, 16, -4, -24, -39, -45,
+ -42, -30, -11, 10, 29, 41, 45, 39, 25, 6, -15, -33, -43, -45, -36, -20,
+ }, {
+ 32, 39, 23, 1, -21, -38, -45, -40, -25, -3, 19, 37, 45, 41, 27, 6,
+ -17, -36, -45, -42, -29, -8, 15, 34, 44, 43, 30, 10, -13, -33, -44, -44,
+ -32, -12, 11, 31, 43, 44, 34, 14, -9, -30, -43, -45, -35, -16, 7, 28,
+ 42, 45, 36, 18, -4, -26, -41, -45, -38, -20, 2, 24, 40, 45, 39, 22,
+ }, {
+ 32, 38, 19, -6, -29, -43, -44, -31, -9, 16, 36, 45, 40, 22, -2, -26,
+ -42, -45, -34, -12, 13, 34, 45, 41, 25, 1, -23, -40, -45, -36, -15, 10,
+ 32, 44, 43, 28, 4, -20, -39, -45, -38, -18, 7, 30, 43, 44, 30, 8,
+ -17, -37, -45, -39, -21, 3, 27, 42, 44, 33, 11, -14, -35, -45, -41, -24,
+ }, {
+ 32, 37, 15, -12, -35, -45, -39, -18, 9, 33, 45, 40, 21, -6, -30, -44,
+ -42, -24, 2, 28, 43, 43, 27, 1, -25, -42, -44, -30, -4, 22, 41, 45,
+ 32, 8, -19, -39, -45, -34, -11, 16, 38, 45, 36, 14, -13, -36, -45, -38,
+ -17, 10, 34, 45, 40, 20, -7, -31, -44, -41, -23, 3, 29, 44, 43, 26,
+ }, {
+ 32, 36, 11, -18, -40, -45, -30, -3, 25, 43, 43, 24, -4, -31, -45, -39,
+ -17, 12, 36, 45, 35, 10, -19, -40, -44, -30, -2, 26, 43, 42, 23, -6,
+ -32, -45, -39, -16, 13, 37, 45, 34, 9, -20, -41, -44, -29, -1, 27, 44,
+ 42, 22, -7, -33, -45, -38, -15, 14, 38, 45, 34, 8, -21, -41, -44, -28,
+ }, {
+ 32, 34, 7, -24, -43, -41, -19, 12, 38, 45, 30, 1, -29, -45, -39, -14,
+ 17, 40, 44, 26, -4, -33, -45, -36, -9, 22, 43, 42, 21, -10, -36, -45,
+ -32, -3, 27, 44, 40, 16, -15, -39, -44, -28, 2, 31, 45, 37, 11, -20,
+ -42, -43, -23, 8, 35, 45, 34, 6, -25, -44, -41, -18, 13, 38, 45, 30,
+ }, {
+ 32, 33, 2, -30, -45, -36, -7, 26, 44, 38, 11, -22, -43, -40, -15, 18,
+ 42, 42, 19, -14, -40, -44, -23, 10, 38, 45, 27, -6, -35, -45, -30, 1,
+ 32, 45, 34, 3, -29, -45, -36, -8, 25, 44, 39, 12, -21, -43, -41, -16,
+ 17, 41, 43, 20, -13, -39, -44, -24, 9, 37, 45, 28, -4, -34, -45, -31,
+ }, {
+ 32, 31, -2, -34, -45, -28, 7, 37, 44, 24, -11, -39, -43, -20, 15, 41,
+ 42, 16, -19, -43, -40, -12, 23, 44, 38, 8, -27, -45, -35, -3, 30, 45,
+ 32, -1, -34, -45, -29, 6, 36, 45, 25, -10, -39, -44, -21, 14, 41, 42,
+ 17, -18, -43, -40, -13, 22, 44, 38, 9, -26, -45, -36, -4, 30, 45, 33,
+ }, {
+ 32, 30, -7, -38, -43, -18, 19, 44, 38, 6, -30, -45, -29, 8, 39, 43,
+ 17, -20, -44, -37, -4, 31, 45, 28, -9, -39, -43, -16, 21, 44, 36, 3,
+ -32, -45, -27, 10, 40, 42, 15, -22, -44, -36, -2, 33, 45, 26, -11, -40,
+ -42, -14, 23, 45, 35, 1, -34, -45, -25, 12, 41, 41, 13, -24, -45, -34,
+ }, {
+ 32, 28, -11, -41, -40, -8, 30, 45, 25, -14, -43, -38, -4, 33, 45, 22,
+ -17, -44, -36, -1, 35, 44, 19, -20, -44, -34, 2, 37, 43, 16, -23, -45,
+ -32, 6, 39, 42, 13, -26, -45, -30, 9, 40, 41, 10, -29, -45, -27, 12,
+ 42, 39, 7, -31, -45, -24, 15, 43, 38, 3, -34, -45, -21, 18, 44, 36,
+ }, {
+ 32, 26, -15, -44, -35, 3, 39, 41, 9, -31, -45, -20, 21, 45, 30, -10,
+ -42, -38, -2, 36, 43, 14, -27, -45, -25, 16, 44, 34, -4, -39, -41, -8,
+ 32, 45, 19, -22, -45, -30, 11, 42, 38, 1, -36, -43, -13, 28, 45, 24,
+ -17, -44, -34, 6, 40, 40, 7, -33, -44, -18, 23, 45, 29, -12, -43, -37,
+ }, {
+ 32, 24, -19, -45, -29, 14, 44, 33, -9, -42, -36, 3, 40, 39, 2, -37,
+ -42, -8, 34, 44, 13, -30, -45, -18, 25, 45, 23, -20, -45, -28, 15, 44,
+ 32, -10, -43, -36, 4, 40, 39, 1, -38, -41, -7, 34, 43, 12, -30, -45,
+ -17, 26, 45, 22, -21, -45, -27, 16, 44, 31, -11, -43, -35, 6, 41, 38,
+ }, {
+ 32, 22, -23, -45, -21, 24, 45, 20, -25, -45, -19, 26, 45, 18, -27, -45,
+ -17, 28, 45, 16, -29, -45, -15, 30, 44, 14, -30, -44, -13, 31, 44, 12,
+ -32, -44, -11, 33, 43, 10, -34, -43, -9, 34, 43, 8, -35, -42, -7, 36,
+ 42, 6, -36, -41, -4, 37, 41, 3, -38, -40, -2, 38, 40, 1, -39, -39,
+ }, {
+ 32, 20, -27, -45, -13, 33, 43, 6, -38, -39, 2, 41, 35, -10, -44, -30,
+ 17, 45, 23, -24, -45, -16, 30, 44, 9, -36, -41, -1, 40, 37, -7, -43,
+ -32, 14, 45, 26, -21, -45, -19, 28, 44, 12, -34, -42, -4, 38, 39, -3,
+ -42, -34, 11, 44, 29, -18, -45, -22, 25, 45, 15, -31, -43, -8, 36, 40,
+ }, {
+ 32, 18, -30, -43, -4, 39, 36, -10, -44, -26, 23, 45, 13, -34, -41, 1,
+ 42, 33, -15, -45, -21, 28, 44, 8, -38, -38, 7, 44, 29, -20, -45, -16,
+ 32, 42, 2, -40, -35, 12, 45, 24, -25, -45, -11, 36, 40, -3, -43, -31,
+ 17, 45, 19, -30, -43, -6, 39, 37, -9, -44, -27, 22, 45, 14, -34, -41,
+ }, {
+ 32, 16, -34, -40, 4, 44, 27, -24, -44, -8, 39, 36, -13, -45, -19, 31,
+ 42, -1, -43, -30, 21, 45, 11, -37, -38, 10, 45, 22, -29, -43, -2, 41,
+ 32, -18, -45, -14, 35, 39, -7, -44, -25, 26, 44, 6, -40, -34, 15, 45,
+ 17, -33, -41, 3, 43, 28, -23, -45, -9, 38, 36, -12, -45, -20, 30, 42,
+ }, {
+ 32, 14, -36, -37, 13, 45, 15, -36, -38, 12, 45, 16, -35, -38, 11, 45,
+ 17, -34, -39, 10, 45, 18, -34, -39, 9, 45, 19, -33, -40, 8, 45, 20,
+ -32, -40, 7, 45, 21, -31, -41, 6, 44, 22, -30, -41, 4, 44, 23, -30,
+ -42, 3, 44, 24, -29, -42, 2, 44, 25, -28, -43, 1, 43, 26, -27, -43,
+ }, {
+ 32, 12, -39, -33, 21, 44, 2, -43, -25, 30, 41, -8, -45, -16, 36, 36,
+ -17, -45, -7, 41, 29, -26, -43, 3, 44, 20, -34, -38, 13, 45, 11, -39,
+ -32, 22, 44, 1, -43, -24, 30, 40, -9, -45, -15, 37, 35, -18, -45, -6,
+ 42, 28, -27, -42, 4, 45, 19, -34, -38, 14, 45, 10, -40, -31, 23, 44,
+ }, {
+ 32, 10, -41, -28, 29, 40, -11, -45, -9, 41, 27, -30, -40, 12, 45, 8,
+ -42, -26, 30, 39, -13, -45, -7, 42, 25, -31, -39, 14, 45, 6, -43, -24,
+ 32, 38, -15, -45, -4, 43, 23, -33, -38, 16, 45, 3, -43, -22, 34, 37,
+ -17, -45, -2, 44, 21, -34, -36, 18, 44, 1, -44, -20, 35, 36, -19, -44,
+ }, {
+ 32, 8, -43, -22, 35, 34, -23, -42, 9, 45, 7, -43, -21, 36, 34, -24,
+ -42, 10, 45, 6, -43, -20, 36, 33, -25, -41, 11, 45, 4, -44, -19, 37,
+ 32, -26, -41, 12, 45, 3, -44, -18, 38, 31, -27, -40, 13, 45, 2, -44,
+ -17, 38, 30, -28, -40, 14, 45, 1, -44, -16, 39, 30, -29, -39, 15, 45,
+ }, {
+ 32, 6, -44, -16, 40, 26, -34, -34, 25, 40, -15, -44, 4, 45, 7, -44,
+ -17, 39, 27, -33, -35, 24, 41, -14, -44, 3, 45, 8, -43, -18, 39, 28,
+ -32, -36, 23, 41, -13, -45, 2, 45, 9, -43, -19, 38, 29, -31, -36, 22,
+ 42, -12, -45, 1, 45, 10, -43, -20, 38, 30, -30, -37, 21, 42, -11, -45,
+ }, {
+ 32, 3, -45, -10, 43, 16, -41, -22, 38, 28, -34, -33, 29, 37, -23, -40,
+ 17, 43, -11, -45, 4, 45, 2, -45, -9, 44, 15, -41, -21, 38, 27, -34,
+ -32, 30, 36, -24, -40, 18, 43, -12, -44, 6, 45, 1, -45, -8, 44, 14,
+ -42, -20, 39, 26, -35, -31, 30, 36, -25, -39, 19, 42, -13, -44, 7, 45,
+ }, {
+ 32, 1, -45, -3, 45, 6, -45, -8, 44, 10, -44, -12, 43, 14, -43, -16,
+ 42, 18, -41, -20, 40, 22, -39, -24, 38, 26, -36, -28, 35, 30, -34, -31,
+ 32, 33, -30, -34, 29, 36, -27, -37, 25, 38, -23, -39, 21, 40, -19, -41,
+ 17, 42, -15, -43, 13, 44, -11, -44, 9, 45, -7, -45, 4, 45, -2, -45,
+ }, {
+ 32, -1, -45, 3, 45, -6, -45, 8, 44, -10, -44, 12, 43, -14, -43, 16,
+ 42, -18, -41, 20, 40, -22, -39, 24, 38, -26, -36, 28, 35, -30, -34, 31,
+ 32, -33, -30, 34, 29, -36, -27, 37, 25, -38, -23, 39, 21, -40, -19, 41,
+ 17, -42, -15, 43, 13, -44, -11, 44, 9, -45, -7, 45, 4, -45, -2, 45,
+ }, {
+ 32, -3, -45, 10, 43, -16, -41, 22, 38, -28, -34, 33, 29, -37, -23, 40,
+ 17, -43, -11, 45, 4, -45, 2, 45, -9, -44, 15, 41, -21, -38, 27, 34,
+ -32, -30, 36, 24, -40, -18, 43, 12, -44, -6, 45, -1, -45, 8, 44, -14,
+ -42, 20, 39, -26, -35, 31, 30, -36, -25, 39, 19, -42, -13, 44, 7, -45,
+ }, {
+ 32, -6, -44, 16, 40, -26, -34, 34, 25, -40, -15, 44, 4, -45, 7, 44,
+ -17, -39, 27, 33, -35, -24, 41, 14, -44, -3, 45, -8, -43, 18, 39, -28,
+ -32, 36, 23, -41, -13, 45, 2, -45, 9, 43, -19, -38, 29, 31, -36, -22,
+ 42, 12, -45, -1, 45, -10, -43, 20, 38, -30, -30, 37, 21, -42, -11, 45,
+ }, {
+ 32, -8, -43, 22, 35, -34, -23, 42, 9, -45, 7, 43, -21, -36, 34, 24,
+ -42, -10, 45, -6, -43, 20, 36, -33, -25, 41, 11, -45, 4, 44, -19, -37,
+ 32, 26, -41, -12, 45, -3, -44, 18, 38, -31, -27, 40, 13, -45, 2, 44,
+ -17, -38, 30, 28, -40, -14, 45, -1, -44, 16, 39, -30, -29, 39, 15, -45,
+ }, {
+ 32, -10, -41, 28, 29, -40, -11, 45, -9, -41, 27, 30, -40, -12, 45, -8,
+ -42, 26, 30, -39, -13, 45, -7, -42, 25, 31, -39, -14, 45, -6, -43, 24,
+ 32, -38, -15, 45, -4, -43, 23, 33, -38, -16, 45, -3, -43, 22, 34, -37,
+ -17, 45, -2, -44, 21, 34, -36, -18, 44, -1, -44, 20, 35, -36, -19, 44,
+ }, {
+ 32, -12, -39, 33, 21, -44, 2, 43, -25, -30, 41, 8, -45, 16, 36, -36,
+ -17, 45, -7, -41, 29, 26, -43, -3, 44, -20, -34, 38, 13, -45, 11, 39,
+ -32, -22, 44, -1, -43, 24, 30, -40, -9, 45, -15, -37, 35, 18, -45, 6,
+ 42, -28, -27, 42, 4, -45, 19, 34, -38, -14, 45, -10, -40, 31, 23, -44,
+ }, {
+ 32, -14, -36, 37, 13, -45, 15, 36, -38, -12, 45, -16, -35, 38, 11, -45,
+ 17, 34, -39, -10, 45, -18, -34, 39, 9, -45, 19, 33, -40, -8, 45, -20,
+ -32, 40, 7, -45, 21, 31, -41, -6, 44, -22, -30, 41, 4, -44, 23, 30,
+ -42, -3, 44, -24, -29, 42, 2, -44, 25, 28, -43, -1, 43, -26, -27, 43,
+ }, {
+ 32, -16, -34, 40, 4, -44, 27, 24, -44, 8, 39, -36, -13, 45, -19, -31,
+ 42, 1, -43, 30, 21, -45, 11, 37, -38, -10, 45, -22, -29, 43, -2, -41,
+ 32, 18, -45, 14, 35, -39, -7, 44, -25, -26, 44, -6, -40, 34, 15, -45,
+ 17, 33, -41, -3, 43, -28, -23, 45, -9, -38, 36, 12, -45, 20, 30, -42,
+ }, {
+ 32, -18, -30, 43, -4, -39, 36, 10, -44, 26, 23, -45, 13, 34, -41, -1,
+ 42, -33, -15, 45, -21, -28, 44, -8, -38, 38, 7, -44, 29, 20, -45, 16,
+ 32, -42, 2, 40, -35, -12, 45, -24, -25, 45, -11, -36, 40, 3, -43, 31,
+ 17, -45, 19, 30, -43, 6, 39, -37, -9, 44, -27, -22, 45, -14, -34, 41,
+ }, {
+ 32, -20, -27, 45, -13, -33, 43, -6, -38, 39, 2, -41, 35, 10, -44, 30,
+ 17, -45, 23, 24, -45, 16, 30, -44, 9, 36, -41, 1, 40, -37, -7, 43,
+ -32, -14, 45, -26, -21, 45, -19, -28, 44, -12, -34, 42, -4, -38, 39, 3,
+ -42, 34, 11, -44, 29, 18, -45, 22, 25, -45, 15, 31, -43, 8, 36, -40,
+ }, {
+ 32, -22, -23, 45, -21, -24, 45, -20, -25, 45, -19, -26, 45, -18, -27, 45,
+ -17, -28, 45, -16, -29, 45, -15, -30, 44, -14, -30, 44, -13, -31, 44, -12,
+ -32, 44, -11, -33, 43, -10, -34, 43, -9, -34, 43, -8, -35, 42, -7, -36,
+ 42, -6, -36, 41, -4, -37, 41, -3, -38, 40, -2, -38, 40, -1, -39, 39,
+ }, {
+ 32, -24, -19, 45, -29, -14, 44, -33, -9, 42, -36, -3, 40, -39, 2, 37,
+ -42, 8, 34, -44, 13, 30, -45, 18, 25, -45, 23, 20, -45, 28, 15, -44,
+ 32, 10, -43, 36, 4, -40, 39, -1, -38, 41, -7, -34, 43, -12, -30, 45,
+ -17, -26, 45, -22, -21, 45, -27, -16, 44, -31, -11, 43, -35, -6, 41, -38,
+ }, {
+ 32, -26, -15, 44, -35, -3, 39, -41, 9, 31, -45, 20, 21, -45, 30, 10,
+ -42, 38, -2, -36, 43, -14, -27, 45, -25, -16, 44, -34, -4, 39, -41, 8,
+ 32, -45, 19, 22, -45, 30, 11, -42, 38, -1, -36, 43, -13, -28, 45, -24,
+ -17, 44, -34, -6, 40, -40, 7, 33, -44, 18, 23, -45, 29, 12, -43, 37,
+ }, {
+ 32, -28, -11, 41, -40, 8, 30, -45, 25, 14, -43, 38, -4, -33, 45, -22,
+ -17, 44, -36, 1, 35, -44, 19, 20, -44, 34, 2, -37, 43, -16, -23, 45,
+ -32, -6, 39, -42, 13, 26, -45, 30, 9, -40, 41, -10, -29, 45, -27, -12,
+ 42, -39, 7, 31, -45, 24, 15, -43, 38, -3, -34, 45, -21, -18, 44, -36,
+ }, {
+ 32, -30, -7, 38, -43, 18, 19, -44, 38, -6, -30, 45, -29, -8, 39, -43,
+ 17, 20, -44, 37, -4, -31, 45, -28, -9, 39, -43, 16, 21, -44, 36, -3,
+ -32, 45, -27, -10, 40, -42, 15, 22, -44, 36, -2, -33, 45, -26, -11, 40,
+ -42, 14, 23, -45, 35, -1, -34, 45, -25, -12, 41, -41, 13, 24, -45, 34,
+ }, {
+ 32, -31, -2, 34, -45, 28, 7, -37, 44, -24, -11, 39, -43, 20, 15, -41,
+ 42, -16, -19, 43, -40, 12, 23, -44, 38, -8, -27, 45, -35, 3, 30, -45,
+ 32, 1, -34, 45, -29, -6, 36, -45, 25, 10, -39, 44, -21, -14, 41, -42,
+ 17, 18, -43, 40, -13, -22, 44, -38, 9, 26, -45, 36, -4, -30, 45, -33,
+ }, {
+ 32, -33, 2, 30, -45, 36, -7, -26, 44, -38, 11, 22, -43, 40, -15, -18,
+ 42, -42, 19, 14, -40, 44, -23, -10, 38, -45, 27, 6, -35, 45, -30, -1,
+ 32, -45, 34, -3, -29, 45, -36, 8, 25, -44, 39, -12, -21, 43, -41, 16,
+ 17, -41, 43, -20, -13, 39, -44, 24, 9, -37, 45, -28, -4, 34, -45, 31,
+ }, {
+ 32, -34, 7, 24, -43, 41, -19, -12, 38, -45, 30, -1, -29, 45, -39, 14,
+ 17, -40, 44, -26, -4, 33, -45, 36, -9, -22, 43, -42, 21, 10, -36, 45,
+ -32, 3, 27, -44, 40, -16, -15, 39, -44, 28, 2, -31, 45, -37, 11, 20,
+ -42, 43, -23, -8, 35, -45, 34, -6, -25, 44, -41, 18, 13, -38, 45, -30,
+ }, {
+ 32, -36, 11, 18, -40, 45, -30, 3, 25, -43, 43, -24, -4, 31, -45, 39,
+ -17, -12, 36, -45, 35, -10, -19, 40, -44, 30, -2, -26, 43, -42, 23, 6,
+ -32, 45, -39, 16, 13, -37, 45, -34, 9, 20, -41, 44, -29, 1, 27, -44,
+ 42, -22, -7, 33, -45, 38, -15, -14, 38, -45, 34, -8, -21, 41, -44, 28,
+ }, {
+ 32, -37, 15, 12, -35, 45, -39, 18, 9, -33, 45, -40, 21, 6, -30, 44,
+ -42, 24, 2, -28, 43, -43, 27, -1, -25, 42, -44, 30, -4, -22, 41, -45,
+ 32, -8, -19, 39, -45, 34, -11, -16, 38, -45, 36, -14, -13, 36, -45, 38,
+ -17, -10, 34, -45, 40, -20, -7, 31, -44, 41, -23, -3, 29, -44, 43, -26,
+ }, {
+ 32, -38, 19, 6, -29, 43, -44, 31, -9, -16, 36, -45, 40, -22, -2, 26,
+ -42, 45, -34, 12, 13, -34, 45, -41, 25, -1, -23, 40, -45, 36, -15, -10,
+ 32, -44, 43, -28, 4, 20, -39, 45, -38, 18, 7, -30, 43, -44, 30, -8,
+ -17, 37, -45, 39, -21, -3, 27, -42, 44, -33, 11, 14, -35, 45, -41, 24,
+ }, {
+ 32, -39, 23, -1, -21, 38, -45, 40, -25, 3, 19, -37, 45, -41, 27, -6,
+ -17, 36, -45, 42, -29, 8, 15, -34, 44, -43, 30, -10, -13, 33, -44, 44,
+ -32, 12, 11, -31, 43, -44, 34, -14, -9, 30, -43, 45, -35, 16, 7, -28,
+ 42, -45, 36, -18, -4, 26, -41, 45, -38, 20, 2, -24, 40, -45, 39, -22,
+ }, {
+ 32, -40, 27, -8, -13, 31, -43, 45, -38, 22, -2, -18, 35, -44, 44, -34,
+ 17, 3, -23, 38, -45, 42, -30, 12, 9, -28, 41, -45, 40, -26, 7, 14,
+ -32, 43, -45, 37, -21, 1, 19, -36, 44, -44, 34, -16, -4, 24, -39, 45,
+ -42, 30, -11, -10, 29, -41, 45, -39, 25, -6, -15, 33, -43, 45, -36, 20,
+ }, {
+ 32, -41, 30, -14, -4, 22, -36, 44, -44, 37, -23, 6, 13, -30, 41, -45,
+ 42, -31, 15, 3, -21, 36, -44, 45, -38, 24, -7, -12, 29, -40, 45, -42,
+ 32, -16, -2, 20, -35, 44, -45, 38, -25, 8, 11, -28, 40, -45, 43, -33,
+ 17, 1, -19, 34, -43, 45, -39, 26, -9, -10, 27, -39, 45, -43, 34, -18,
+ }, {
+ 32, -42, 34, -20, 4, 12, -27, 38, -44, 45, -39, 28, -13, -3, 19, -33,
+ 42, -45, 43, -34, 21, -6, -11, 26, -38, 44, -45, 39, -29, 14, 2, -18,
+ 32, -41, 45, -43, 35, -22, 7, 10, -25, 37, -44, 45, -40, 30, -15, -1,
+ 17, -31, 41, -45, 43, -36, 23, -8, -9, 24, -36, 44, -45, 40, -30, 16,
+ }, {
+ 32, -43, 36, -26, 13, 1, -15, 28, -38, 44, -45, 42, -35, 24, -11, -3,
+ 17, -30, 39, -44, 45, -41, 34, -22, 9, 6, -19, 31, -40, 45, -45, 40,
+ -32, 20, -7, -8, 21, -33, 41, -45, 44, -39, 30, -18, 4, 10, -23, 34,
+ -42, 45, -44, 38, -29, 16, -2, -12, 25, -36, 43, -45, 43, -37, 27, -14,
+ }, {
+ 32, -44, 39, -31, 21, -10, -2, 14, -25, 34, -41, 45, -45, 42, -36, 28,
+ -17, 6, 7, -18, 29, -37, 43, -45, 44, -40, 34, -24, 13, -1, -11, 22,
+ -32, 39, -44, 45, -43, 38, -30, 20, -9, -3, 15, -26, 35, -41, 45, -45,
+ 42, -36, 27, -16, 4, 8, -19, 30, -38, 43, -45, 44, -40, 33, -23, 12,
+ }, {
+ 32, -44, 41, -36, 29, -20, 11, -1, -9, 18, -27, 34, -40, 44, -45, 45,
+ -42, 37, -30, 22, -13, 3, 7, -16, 25, -33, 39, -43, 45, -45, 43, -38,
+ 32, -24, 15, -6, -4, 14, -23, 31, -38, 42, -45, 45, -43, 39, -34, 26,
+ -17, 8, 2, -12, 21, -30, 36, -41, 44, -45, 44, -40, 35, -28, 19, -10,
+ }, {
+ 32, -45, 43, -39, 35, -30, 23, -16, 9, -1, -7, 14, -21, 28, -34, 38,
+ -42, 44, -45, 45, -43, 40, -36, 31, -25, 18, -11, 3, 4, -12, 19, -26,
+ 32, -37, 41, -44, 45, -45, 44, -41, 38, -33, 27, -20, 13, -6, -2, 10,
+ -17, 24, -30, 36, -40, 43, -45, 45, -44, 42, -39, 34, -29, 22, -15, 8,
+ }, {
+ 32, -45, 44, -42, 40, -37, 34, -30, 25, -20, 15, -10, 4, 1, -7, 12,
+ -17, 22, -27, 31, -35, 38, -41, 43, -44, 45, -45, 45, -43, 41, -39, 36,
+ -32, 28, -23, 18, -13, 8, -2, -3, 9, -14, 19, -24, 29, -33, 36, -39,
+ 42, -44, 45, -45, 45, -44, 43, -40, 38, -34, 30, -26, 21, -16, 11, -6,
+ }, {
+ 32, -45, 45, -44, 43, -42, 41, -39, 38, -36, 34, -31, 29, -26, 23, -20,
+ 17, -14, 11, -8, 4, -1, -2, 6, -9, 12, -15, 18, -21, 24, -27, 30,
+ -32, 34, -36, 38, -40, 41, -43, 44, -44, 45, -45, 45, -45, 45, -44, 43,
+ -42, 40, -39, 37, -35, 33, -30, 28, -25, 22, -19, 16, -13, 10, -7, 3,
+ }, {
+ 32, -45, 45, -45, 45, -45, 45, -45, 44, -44, 44, -44, 43, -43, 43, -42,
+ 42, -41, 41, -40, 40, -39, 39, -38, 38, -37, 36, -36, 35, -34, 34, -33,
+ 32, -31, 30, -30, 29, -28, 27, -26, 25, -24, 23, -22, 21, -20, 19, -18,
+ 17, -16, 15, -14, 13, -12, 11, -10, 9, -8, 7, -6, 4, -3, 2, -1,
+ }
+};
diff --git a/src/shaders/icc.c b/src/shaders/icc.c
new file mode 100644
index 0000000..6a16cfd
--- /dev/null
+++ b/src/shaders/icc.c
@@ -0,0 +1,781 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <math.h>
+#include "shaders.h"
+
+#include <libplacebo/tone_mapping.h>
+#include <libplacebo/shaders/icc.h>
+
+// Default ICC parameters: a `pl_icc_params` initialized from PL_ICC_DEFAULTS.
+const struct pl_icc_params pl_icc_default_params = { PL_ICC_DEFAULTS };
+
+#ifdef PL_HAVE_LCMS
+
+#include <lcms2.h>
+#include <lcms2_plugin.h>
+
+// Private per-object state of a `pl_icc_object` (retrieved with PL_PRIV).
+struct icc_priv {
+ pl_log log; // logger; also handed to lcms2 as the context user data
+ pl_cache cache; // for backwards compatibility
+ cmsContext cms; // LittleCMS context owned by this object
+ cmsHPROFILE profile; // the opened ICC profile
+ cmsHPROFILE approx; // approximation profile
+ float a, b, scale; // approximation tone curve parameters and scaling
+ cmsCIEXYZ black; // black point detected from the profile
+ float gamma_stddev; // standard deviation of the estimated gamma
+ uint64_t lut_sig; // hash of all parameters affecting the generated 3DLUT
+};
+
+// LittleCMS error handler: forwards an lcms2 diagnostic to the pl_log that
+// was registered as the user data of the reporting context.
+static void error_callback(cmsContext cms, cmsUInt32Number code,
+                           const char *msg)
+{
+    const int errcode = (int) code;
+    pl_log logger = cmsGetContextUserData(cms);
+    pl_err(logger, "lcms2: [%d] %s", errcode, msg);
+}
+
+// pl_cache `set` hook for the legacy caching API: hands the cache object's
+// key, data and size to the user-supplied `cache_save` callback.
+static void set_callback(void *priv, pl_cache_obj obj)
+{
+    pl_icc_object owner = priv;
+    const struct pl_icc_params *legacy = &owner->params;
+    legacy->cache_save(legacy->cache_priv, obj.key, obj.data, obj.size);
+}
+
+// pl_cache `get` hook for the legacy caching API. Allocates a buffer sized
+// for a full 3DLUT (size_r * size_g * size_b entries of four 16-bit values)
+// and asks the user-supplied `cache_load` callback to fill it.
+//
+// Returns a cache object owning the buffer (released with pl_free) on
+// success, or a zeroed object if the callback reports a miss.
+static pl_cache_obj get_callback(void *priv, uint64_t key)
+{
+    pl_icc_object icc = priv;
+    const struct pl_icc_params *par = &icc->params;
+
+    // Do the whole computation in size_t: multiplying the three dimensions
+    // as `int` could overflow (undefined behavior) for large fixed sizes.
+    const size_t entries = (size_t) par->size_r * (size_t) par->size_g *
+                           (size_t) par->size_b;
+    const size_t data_size = entries * sizeof(uint16_t[4]);
+
+    void *data = pl_alloc(NULL, data_size);
+    if (!par->cache_load(par->cache_priv, key, data, data_size)) {
+        pl_free(data);
+        return (pl_cache_obj) {0};
+    }
+
+    return (pl_cache_obj) {
+        .key = key,
+        .data = data,
+        .size = data_size,
+        .free = pl_free,
+    };
+}
+
+// Destroys an ICC object: closes both lcms2 profiles, deletes the lcms2
+// context, destroys the compatibility cache and finally frees the object
+// through `picc`. Does nothing when `*picc` is NULL.
+void pl_icc_close(pl_icc_object *picc)
+{
+    pl_icc_object obj = *picc;
+    if (obj) {
+        struct icc_priv *priv = PL_PRIV(obj);
+        cmsCloseProfile(priv->approx);
+        cmsCloseProfile(priv->profile);
+        cmsDeleteContext(priv->cms);
+        pl_cache_destroy(&priv->cache);
+        pl_free_ptr((void **) picc);
+    }
+}
+
+// Probes the ICC profile by pushing a fixed set of 8-bit RGB test patches
+// through an absolute colorimetric RGB -> XYZ transform. On success, fills
+// `*prim` from the red/green/blue/white patches (via pl_cie_from_XYZ), writes
+// the mean gamma estimated over the gray ramp to `*out_gamma`, and stores the
+// matching standard deviation in the private `gamma_stddev` field.
+//
+// Returns false if the XYZ profile or the transform cannot be created, or if
+// the gamma estimate is not a positive number (that check is only reached
+// when the standard deviation is not above 0.5).
+static bool detect_csp(pl_icc_object icc, struct pl_raw_primaries *prim,
+ float *out_gamma)
+{
+ struct icc_priv *p = PL_PRIV(icc);
+ cmsHTRANSFORM tf;
+ cmsHPROFILE xyz = cmsCreateXYZProfileTHR(p->cms);
+ if (!xyz)
+ return false;
+
+ // We need to use an unadapted observer to get the raw values
+ // (the previous adaptation state is restored right after the transform
+ // has been created)
+ cmsFloat64Number prev_adapt = cmsSetAdaptationStateTHR(p->cms, 0.0);
+ tf = cmsCreateTransformTHR(p->cms, p->profile, TYPE_RGB_8, xyz, TYPE_XYZ_DBL,
+ INTENT_ABSOLUTE_COLORIMETRIC,
+ /* Note: These flags mostly don't do anything
+ * anyway, but specify them regardless */
+ cmsFLAGS_NOCACHE |
+ cmsFLAGS_NOOPTIMIZE);
+ cmsSetAdaptationStateTHR(p->cms, prev_adapt);
+ cmsCloseProfile(xyz);
+ if (!tf)
+ return false;
+
+ // Indices of the named test patches; RAMP is the index of the first
+ // gray ramp entry in `test`
+ enum {
+ RED,
+ GREEN,
+ BLUE,
+ WHITE,
+ BLACK,
+ GRAY,
+ RAMP,
+ };
+
+ static const uint8_t test[][3] = {
+ [RED] = { 0xFF, 0, 0 },
+ [GREEN] = { 0, 0xFF, 0 },
+ [BLUE] = { 0, 0, 0xFF },
+ [WHITE] = { 0xFF, 0xFF, 0xFF },
+ [BLACK] = { 0x00, 0x00, 0x00 },
+ [GRAY] = { 0x80, 0x80, 0x80 },
+
+ // Grayscale ramp (excluding endpoints)
+ // These positional entries follow [GRAY], so they start at index RAMP
+#define V(d) { d, d, d }
+ V(0x01), V(0x02), V(0x03), V(0x04), V(0x05), V(0x06), V(0x07),
+ V(0x08), V(0x09), V(0x0A), V(0x0B), V(0x0C), V(0x0D), V(0x0E), V(0x0F),
+ V(0x10), V(0x11), V(0x12), V(0x13), V(0x14), V(0x15), V(0x16), V(0x17),
+ V(0x18), V(0x19), V(0x1A), V(0x1B), V(0x1C), V(0x1D), V(0x1E), V(0x1F),
+ V(0x20), V(0x21), V(0x22), V(0x23), V(0x24), V(0x25), V(0x26), V(0x27),
+ V(0x28), V(0x29), V(0x2A), V(0x2B), V(0x2C), V(0x2D), V(0x2E), V(0x2F),
+ V(0x30), V(0x31), V(0x32), V(0x33), V(0x34), V(0x35), V(0x36), V(0x37),
+ V(0x38), V(0x39), V(0x3A), V(0x3B), V(0x3C), V(0x3D), V(0x3E), V(0x3F),
+ V(0x40), V(0x41), V(0x42), V(0x43), V(0x44), V(0x45), V(0x46), V(0x47),
+ V(0x48), V(0x49), V(0x4A), V(0x4B), V(0x4C), V(0x4D), V(0x4E), V(0x4F),
+ V(0x50), V(0x51), V(0x52), V(0x53), V(0x54), V(0x55), V(0x56), V(0x57),
+ V(0x58), V(0x59), V(0x5A), V(0x5B), V(0x5C), V(0x5D), V(0x5E), V(0x5F),
+ V(0x60), V(0x61), V(0x62), V(0x63), V(0x64), V(0x65), V(0x66), V(0x67),
+ V(0x68), V(0x69), V(0x6A), V(0x6B), V(0x6C), V(0x6D), V(0x6E), V(0x6F),
+ V(0x70), V(0x71), V(0x72), V(0x73), V(0x74), V(0x75), V(0x76), V(0x77),
+ V(0x78), V(0x79), V(0x7A), V(0x7B), V(0x7C), V(0x7D), V(0x7E), V(0x7F),
+ V(0x80), V(0x81), V(0x82), V(0x83), V(0x84), V(0x85), V(0x86), V(0x87),
+ V(0x88), V(0x89), V(0x8A), V(0x8B), V(0x8C), V(0x8D), V(0x8E), V(0x8F),
+ V(0x90), V(0x91), V(0x92), V(0x93), V(0x94), V(0x95), V(0x96), V(0x97),
+ V(0x98), V(0x99), V(0x9A), V(0x9B), V(0x9C), V(0x9D), V(0x9E), V(0x9F),
+ V(0xA0), V(0xA1), V(0xA2), V(0xA3), V(0xA4), V(0xA5), V(0xA6), V(0xA7),
+ V(0xA8), V(0xA9), V(0xAA), V(0xAB), V(0xAC), V(0xAD), V(0xAE), V(0xAF),
+ V(0xB0), V(0xB1), V(0xB2), V(0xB3), V(0xB4), V(0xB5), V(0xB6), V(0xB7),
+ V(0xB8), V(0xB9), V(0xBA), V(0xBB), V(0xBC), V(0xBD), V(0xBE), V(0xBF),
+ V(0xC0), V(0xC1), V(0xC2), V(0xC3), V(0xC4), V(0xC5), V(0xC6), V(0xC7),
+ V(0xC8), V(0xC9), V(0xCA), V(0xCB), V(0xCC), V(0xCD), V(0xCE), V(0xCF),
+ V(0xD0), V(0xD1), V(0xD2), V(0xD3), V(0xD4), V(0xD5), V(0xD6), V(0xD7),
+ V(0xD8), V(0xD9), V(0xDA), V(0xDB), V(0xDC), V(0xDD), V(0xDE), V(0xDF),
+ V(0xE0), V(0xE1), V(0xE2), V(0xE3), V(0xE4), V(0xE5), V(0xE6), V(0xE7),
+ V(0xE8), V(0xE9), V(0xEA), V(0xEB), V(0xEC), V(0xED), V(0xEE), V(0xEF),
+ V(0xF0), V(0xF1), V(0xF2), V(0xF3), V(0xF4), V(0xF5), V(0xF6), V(0xF7),
+ V(0xF8), V(0xF9), V(0xFA), V(0xFB), V(0xFC), V(0xFD), V(0xFE),
+#undef V
+ };
+
+ // Transform all test patches in one call; `dst[i]` is the XYZ of `test[i]`
+ cmsCIEXYZ dst[PL_ARRAY_SIZE(test)] = {0};
+ cmsDoTransform(tf, test, dst, PL_ARRAY_SIZE(dst));
+ cmsDeleteTransform(tf);
+
+ // Read primaries from transformed RGBW values
+ prim->red = pl_cie_from_XYZ(dst[RED].X, dst[RED].Y, dst[RED].Z);
+ prim->green = pl_cie_from_XYZ(dst[GREEN].X, dst[GREEN].Y, dst[GREEN].Z);
+ prim->blue = pl_cie_from_XYZ(dst[BLUE].X, dst[BLUE].Y, dst[BLUE].Z);
+ prim->white = pl_cie_from_XYZ(dst[WHITE].X, dst[WHITE].Y, dst[WHITE].Z);
+
+ // Rough estimate of overall gamma and starting point for curve black point
+ // (falls back to 1.0 when the mid-gray patch has zero luminance)
+ const float y_approx = dst[GRAY].Y ? log(dst[GRAY].Y) / log(0.5) : 1.0f;
+ const float kb = fmaxf(dst[BLACK].Y, 0.0f);
+ float b = powf(kb, 1 / y_approx);
+
+ // Estimate mean and stddev of gamma (Welford's method)
+ // M is the running mean, S the running sum of squared deviations, and k
+ // is one more than the number of samples accepted so far
+ float M = 0.0, S = 0.0;
+ int k = 1;
+ for (int i = RAMP; i < PL_ARRAY_SIZE(dst); i++) { // exclude primaries
+ // Skip samples whose luminance lies outside the open interval (0, 1)
+ if (dst[i].Y <= 0 || dst[i].Y >= 1)
+ continue;
+ // Encoded gray level remapped from [0, 1] to [b, 1]
+ float src = (1 - b) * (test[i][0] / 255.0) + b;
+ float y = log(dst[i].Y) / log(src);
+ float tmpM = M;
+ M += (y - tmpM) / k;
+ S += (y - tmpM) * (y - M);
+ k++;
+
+ // Update estimate of black point according to current gamma estimate
+ b = powf(kb, 1 / M);
+ }
+ // NOTE(review): if every ramp sample was skipped, k - 1 is 0 here; M is
+ // then still 0, so the `!(M > 0)` branch below is expected to reject it.
+ S = sqrt(S / (k - 1));
+
+ PL_INFO(p, "Detected profile approximation gamma %.3f", M);
+ if (S > 0.5) {
+ PL_WARN(p, "Detected profile gamma (%.3f) very far from pure power "
+ "response (stddev=%.1f), suspected unusual or broken profile. "
+ "Using anyway, but results may be poor.", M, S);
+ } else if (!(M > 0)) {
+ PL_ERR(p, "Arithmetic error in ICC profile gamma estimation? "
+ "Please open an issue");
+ return false;
+ }
+
+ *out_gamma = M;
+ p->gamma_stddev = S;
+ return true;
+}
+
+// Determines the luminance range of the profile and stores it in `hdr`.
+//
+// The black point is detected with LittleCMS and kept in the private `black`
+// field. The peak is `max_luma` when positive, otherwise the Y value of the
+// profile's luminance tag if present, otherwise PL_COLOR_SDR_WHITE. The
+// minimum is the black point's Y scaled by that peak, floored at 1e-6.
+//
+// May rewrite `params->intent` to perceptual and retry when black point
+// detection fails on a v4 profile. Returns false if no black point could be
+// detected.
+static bool detect_contrast(pl_icc_object icc, struct pl_hdr_metadata *hdr,
+                            struct pl_icc_params *params, float max_luma)
+{
+    struct icc_priv *priv = PL_PRIV(icc);
+    cmsCIEXYZ *lum_tag = cmsReadTag(priv->profile, cmsSigLuminanceTag);
+
+    // Black point detection is not attempted with the absolute colorimetric
+    // intent (LittleCMS refuses it); relative colorimetric is used instead,
+    // since only the brightness value matters here.
+    enum pl_rendering_intent bp_intent = params->intent;
+    if (bp_intent == PL_INTENT_ABSOLUTE_COLORIMETRIC)
+        bp_intent = PL_INTENT_RELATIVE_COLORIMETRIC;
+
+    const bool have_black = cmsDetectDestinationBlackPoint(&priv->black,
+                                                           priv->profile,
+                                                           bp_intent, 0);
+    if (!have_black) {
+        // v4 ICC profiles only carry a black point tag for the
+        // perceptual/saturation intents, so retry once as perceptual.
+        const bool is_v4 = cmsGetEncodedICCversion(priv->profile) >= 0x4000000;
+        if (is_v4 && bp_intent != PL_INTENT_PERCEPTUAL) {
+            params->intent = PL_INTENT_PERCEPTUAL;
+            return detect_contrast(icc, hdr, params, max_luma);
+        }
+
+        PL_ERR(priv, "Failed detecting ICC profile black point!");
+        return false;
+    }
+
+    if (lum_tag) {
+        PL_DEBUG(priv, "Detected raw white point X=%.2f Y=%.2f Z=%.2f cd/m^2",
+                 lum_tag->X, lum_tag->Y, lum_tag->Z);
+    }
+    PL_DEBUG(priv, "Detected raw black point X=%.6f%% Y=%.6f%% Z=%.6f%%",
+             priv->black.X * 100, priv->black.Y * 100, priv->black.Z * 100);
+
+    if (max_luma <= 0) {
+        if (lum_tag)
+            max_luma = lum_tag->Y;
+        else
+            max_luma = PL_COLOR_SDR_WHITE;
+    }
+
+    hdr->max_luma = max_luma;
+    float floor_luma = priv->black.Y * max_luma;
+    hdr->min_luma = PL_MAX(floor_luma, 1e-6); // prevent true 0
+    PL_INFO(priv, "Using ICC contrast %.0f:1", hdr->max_luma / hdr->min_luma);
+    return true;
+}
+
+// Chooses the 3DLUT dimensions (params->size_r/g/b) for this ICC object.
+//
+// If all three sizes are already non-zero they are used unchanged. Otherwise
+// each size is raised to the largest of several lower bounds (a default of 9,
+// a bound derived from the black point, a bound derived from the gamma
+// stddev, and the grid sizes of any 3-input CLUT stage found in the profile),
+// then clamped to 129 per axis and scaled down so that the total stays at
+// roughly one million entries.
+static void infer_clut_size(struct pl_icc_object_t *icc)
+{
+ struct icc_priv *p = PL_PRIV(icc);
+ struct pl_icc_params *params = &icc->params;
+ if (params->size_r && params->size_g && params->size_b) {
+ PL_DEBUG(p, "Using fixed 3DLUT size: %dx%dx%d",
+ (int) params->size_r, (int) params->size_g, (int) params->size_b);
+ return;
+ }
+
+ // Raises all three dimensions to at least N. Expands to three separate
+ // statements, so it must only be used as a full statement inside braces.
+#define REQUIRE_SIZE(N) \
+ params->size_r = PL_MAX(params->size_r, N); \
+ params->size_g = PL_MAX(params->size_g, N); \
+ params->size_b = PL_MAX(params->size_b, N)
+
+ // Default size for sanity
+ REQUIRE_SIZE(9);
+
+ // Ensure enough precision to track the (absolute) black point
+ if (p->black.Y > 1e-4) {
+ // Black level mapped through the inverse of the estimated gamma
+ float black_rel = powf(p->black.Y, 1.0f / icc->gamma);
+ int min_size = 2 * (int) ceilf(1.0f / black_rel);
+ REQUIRE_SIZE(min_size);
+ }
+
+ // Ensure enough precision to track the gamma curve
+ if (p->gamma_stddev > 1e-2) {
+ REQUIRE_SIZE(65);
+ } else if (p->gamma_stddev > 1e-3) {
+ REQUIRE_SIZE(33);
+ } else if (p->gamma_stddev > 1e-4) {
+ REQUIRE_SIZE(17);
+ }
+
+ // Ensure enough precision to track any internal CLUTs
+ // Look for a BToA pipeline first: the tag matching the intent is tried,
+ // then each case falls through to the next tag when the read returns NULL.
+ // `default` sits with the colorimetric cases, so any other intent value
+ // starts at BToA1.
+ cmsPipeline *pipe = NULL;
+ switch (icc->params.intent) {
+ case PL_INTENT_SATURATION:
+ pipe = cmsReadTag(p->profile, cmsSigBToA2Tag);
+ if (pipe)
+ break;
+ // fall through
+ case PL_INTENT_RELATIVE_COLORIMETRIC:
+ case PL_INTENT_ABSOLUTE_COLORIMETRIC:
+ default:
+ pipe = cmsReadTag(p->profile, cmsSigBToA1Tag);
+ if (pipe)
+ break;
+ // fall through
+ case PL_INTENT_PERCEPTUAL:
+ pipe = cmsReadTag(p->profile, cmsSigBToA0Tag);
+ break;
+ }
+
+ // No BToA pipeline found: repeat the same search over the AToB tags
+ if (!pipe) {
+ switch (icc->params.intent) {
+ case PL_INTENT_SATURATION:
+ pipe = cmsReadTag(p->profile, cmsSigAToB2Tag);
+ if (pipe)
+ break;
+ // fall through
+ case PL_INTENT_RELATIVE_COLORIMETRIC:
+ case PL_INTENT_ABSOLUTE_COLORIMETRIC:
+ default:
+ pipe = cmsReadTag(p->profile, cmsSigAToB1Tag);
+ if (pipe)
+ break;
+ // fall through
+ case PL_INTENT_PERCEPTUAL:
+ pipe = cmsReadTag(p->profile, cmsSigAToB0Tag);
+ break;
+ }
+ }
+
+ // Walk the pipeline stages and grow each axis to the grid size of every
+ // 3-input CLUT stage. Note that `continue` inside the switch advances
+ // the enclosing for loop.
+ if (pipe) {
+ for (cmsStage *stage = cmsPipelineGetPtrToFirstStage(pipe);
+ stage; stage = cmsStageNext(stage))
+ {
+ switch (cmsStageType(stage)) {
+ // The empty statement after the label allows a declaration to follow
+ case cmsSigCLutElemType: ;
+ _cmsStageCLutData *data = cmsStageData(stage);
+ if (data->Params->nInputs != 3)
+ continue;
+ params->size_r = PL_MAX(params->size_r, data->Params->nSamples[0]);
+ params->size_g = PL_MAX(params->size_g, data->Params->nSamples[1]);
+ params->size_b = PL_MAX(params->size_b, data->Params->nSamples[2]);
+ break;
+
+ default:
+ continue;
+ }
+ }
+ }
+
+ // Clamp the output size to make sure profiles are not too large
+ params->size_r = PL_MIN(params->size_r, 129);
+ params->size_g = PL_MIN(params->size_g, 129);
+ params->size_b = PL_MIN(params->size_b, 129);
+
+ // Constrain the total LUT size to roughly 1M entries
+ // Each axis is scaled by the cube root of the excess ratio and rounded
+ // up, so the result can end up slightly above max_size.
+ const size_t max_size = 1000000;
+ size_t total_size = params->size_r * params->size_g * params->size_b;
+ if (total_size > max_size) {
+ float factor = powf((float) max_size / total_size, 1/3.0f);
+ params->size_r = ceilf(factor * params->size_r);
+ params->size_g = ceilf(factor * params->size_g);
+ params->size_b = ceilf(factor * params->size_b);
+ }
+
+ PL_INFO(p, "Chosen 3DLUT size: %dx%dx%d",
+ (int) params->size_r, (int) params->size_g, (int) params->size_b);
+}
+
+// Derives all state that depends on the opened profile and on icc->params:
+// the rendering intent, detected primaries and gamma, luminance range, 3DLUT
+// size, containing primaries, the approximation profile, the 3DLUT cache
+// signature and (for the old caching API) the compatibility cache.
+//
+// Expects p->cms and p->profile to be set already. Returns false if color
+// space or contrast detection fails, or if the tone curve or approximation
+// profile cannot be created.
+static bool icc_init(struct pl_icc_object_t *icc)
+{
+ struct icc_priv *p = PL_PRIV(icc);
+ struct pl_icc_params *params = &icc->params;
+ // An out-of-range intent means "use the one from the profile header"
+ if (params->intent < 0 || params->intent > PL_INTENT_ABSOLUTE_COLORIMETRIC)
+ params->intent = cmsGetHeaderRenderingIntent(p->profile);
+
+ struct pl_raw_primaries *out_prim = &icc->csp.hdr.prim;
+ if (!detect_csp(icc, out_prim, &icc->gamma))
+ return false;
+ if (!detect_contrast(icc, &icc->csp.hdr, params, params->max_luma))
+ return false;
+ infer_clut_size(icc);
+
+ // Find the named primaries to use for the approximation profile: a set
+ // similar to the detected primaries ends the search immediately;
+ // otherwise keep the candidate that contains the detected primaries and
+ // is itself contained by the best one so far (presumably the tightest
+ // enclosing gamut — depends on pl_primaries_superset's argument order).
+ const struct pl_raw_primaries *best = NULL;
+ for (enum pl_color_primaries prim = 1; prim < PL_COLOR_PRIM_COUNT; prim++) {
+ const struct pl_raw_primaries *raw = pl_raw_primaries_get(prim);
+ if (!icc->csp.primaries && pl_raw_primaries_similar(raw, out_prim)) {
+ icc->containing_primaries = prim;
+ icc->csp.primaries = prim;
+ best = raw;
+ break;
+ }
+
+ if (pl_primaries_superset(raw, out_prim) &&
+ (!best || pl_primaries_superset(best, raw)))
+ {
+ icc->containing_primaries = prim;
+ best = raw;
+ }
+ }
+
+ // No candidate contains the profile's gamut: fall back to ACES AP0
+ if (!best) {
+ PL_WARN(p, "ICC profile too wide to handle, colors may be clipped!");
+ icc->containing_primaries = PL_COLOR_PRIM_ACES_AP0;
+ best = pl_raw_primaries_get(icc->containing_primaries);
+ }
+
+ // Create approximation profile. Use a tone-curve based on a BT.1886-style
+ // pure power curve, with an approximation gamma matched to the ICC
+ // profile. We stretch the luminance range *before* the input to the gamma
+ // function, to avoid numerical issues near the black point. (This removes
+ // the need for a separate linear section)
+ //
+ // Y = scale * (aX + b)^y, where Y = PCS luma and X = encoded value ([0-1])
+ p->scale = pl_hdr_rescale(PL_HDR_NITS, PL_HDR_NORM, icc->csp.hdr.max_luma);
+ p->b = powf(icc->csp.hdr.min_luma / icc->csp.hdr.max_luma, 1.0f / icc->gamma);
+ p->a = (1 - p->b);
+ // Parametric curve type 2 with parameters { gamma, a, b }
+ cmsToneCurve *curve = cmsBuildParametricToneCurve(p->cms, 2,
+ (double[3]) { icc->gamma, p->a, p->b });
+ if (!curve)
+ return false;
+
+ cmsCIExyY wp_xyY = { best->white.x, best->white.y, 1.0 };
+ cmsCIExyYTRIPLE prim_xyY = {
+ .Red = { best->red.x, best->red.y, 1.0 },
+ .Green = { best->green.x, best->green.y, 1.0 },
+ .Blue = { best->blue.x, best->blue.y, 1.0 },
+ };
+
+ // The same curve is used for all three channels; it is freed right after
+ // the profile creation call, whether or not that call succeeded
+ p->approx = cmsCreateRGBProfileTHR(p->cms, &wp_xyY, &prim_xyY,
+ (cmsToneCurve *[3]){ curve, curve, curve });
+ cmsFreeToneCurve(curve);
+ if (!p->approx)
+ return false;
+
+ // We need to create an ICC V2 profile because ICC V4 perceptual profiles
+ // have normalized semantics, but we want colorimetric mapping with BPC
+ cmsSetHeaderRenderingIntent(p->approx, icc->params.intent);
+ cmsSetProfileVersion(p->approx, 2.2);
+
+ // Hash all parameters affecting the generated 3DLUT
+ p->lut_sig = CACHE_KEY_ICC_3DLUT;
+ pl_hash_merge(&p->lut_sig, icc->signature);
+ pl_hash_merge(&p->lut_sig, params->intent);
+ pl_hash_merge(&p->lut_sig, params->size_r);
+ pl_hash_merge(&p->lut_sig, params->size_g);
+ pl_hash_merge(&p->lut_sig, params->size_b);
+ pl_hash_merge(&p->lut_sig, params->force_bpc);
+ // Merge the bit pattern of max_luma (widened to double) via a union
+ union { double d; uint64_t u; } v = { .d = icc->csp.hdr.max_luma };
+ pl_hash_merge(&p->lut_sig, v.u);
+ // min luma depends only on the max luma and profile
+
+ // Backwards compatibility with old caching API
+ // (only when legacy callbacks are set and no pl_cache was provided)
+ if ((params->cache_save || params->cache_load) && !params->cache) {
+ p->cache = pl_cache_create(pl_cache_params(
+ .log = p->log,
+ .set = params->cache_save ? set_callback : NULL,
+ .get = params->cache_load ? get_callback : NULL,
+ .priv = icc,
+ ));
+ }
+
+ return true;
+}
+
+// Opens an ICC profile from memory and returns a new ICC object, or NULL on
+// failure (empty profile, lcms2 context or profile creation failure, non-RGB
+// profile, or failed initialization). `params` may be NULL, in which case
+// pl_icc_default_params is used.
+pl_icc_object pl_icc_open(pl_log log, const struct pl_icc_profile *profile,
+                          const struct pl_icc_params *params)
+{
+    if (profile->len == 0)
+        return NULL;
+
+    struct pl_icc_object_t *obj = pl_zalloc_obj(NULL, obj, struct icc_priv);
+    struct icc_priv *priv = PL_PRIV(obj);
+
+    if (params)
+        obj->params = *params;
+    else
+        obj->params = pl_icc_default_params;
+    obj->signature = profile->signature;
+
+    // The log doubles as the lcms2 context user data for error_callback
+    priv->log = log;
+    priv->cms = cmsCreateContext(NULL, (void *) log);
+    if (!priv->cms) {
+        PL_ERR(priv, "Failed creating LittleCMS context!");
+        goto fail;
+    }
+
+    cmsSetLogErrorHandlerTHR(priv->cms, error_callback);
+    PL_INFO(priv, "Opening ICC profile..");
+    priv->profile = cmsOpenProfileFromMemTHR(priv->cms, profile->data,
+                                             profile->len);
+    if (!priv->profile) {
+        PL_ERR(priv, "Failed opening ICC profile");
+        goto fail;
+    }
+
+    if (cmsGetColorSpace(priv->profile) != cmsSigRgbData) {
+        PL_ERR(priv, "Invalid ICC profile: not RGB");
+        goto fail;
+    }
+
+    if (icc_init(obj))
+        return obj;
+
+fail:
+    pl_icc_close((pl_icc_object *) &obj);
+    return NULL;
+}
+
+// Re-initializes an existing ICC object in place with new parameters. The
+// approximation profile and compatibility cache are released, all public and
+// private state is reset except for the signature, log, lcms2 context and
+// opened profile, and icc_init() is run again. Returns icc_init()'s result.
+static bool icc_reopen(pl_icc_object kicc, const struct pl_icc_params *params)
+{
+    struct pl_icc_object_t *obj = (struct pl_icc_object_t *) kicc;
+    struct icc_priv *priv = PL_PRIV(obj);
+
+    // Drop everything that icc_init() is going to recreate
+    cmsCloseProfile(priv->approx);
+    pl_cache_destroy(&priv->cache);
+
+    const uint64_t kept_signature = obj->signature;
+    *obj = (struct pl_icc_object_t) {
+        .params = *params,
+        .signature = kept_signature,
+    };
+
+    pl_log kept_log = priv->log;
+    cmsContext kept_cms = priv->cms;
+    cmsHPROFILE kept_profile = priv->profile;
+    *priv = (struct icc_priv) {
+        .log = kept_log,
+        .cms = kept_cms,
+        .profile = kept_profile,
+    };
+
+    PL_DEBUG(priv, "Reinitializing ICC profile in-place");
+    return icc_init(obj);
+}
+
+// Brings `*out_icc` up to date with `profile` and `params`.
+//
+// - With no existing object and no profile there is nothing to do (false).
+// - If there is no object yet, or the signature differs, the old object is
+//   closed and a new one is opened from `profile`.
+// - If the signature matches and the relevant parameters (intent, max_luma,
+//   force_bpc and the effective LUT sizes) are unchanged, nothing happens.
+// - Otherwise the object is re-initialized in place; on failure it is closed
+//   and `*out_icc` is set to NULL.
+//
+// `params` may be NULL to use pl_icc_default_params. Returns true on success.
+bool pl_icc_update(pl_log log, pl_icc_object *out_icc,
+                   const struct pl_icc_profile *profile,
+                   const struct pl_icc_params *params)
+{
+    params = PL_DEF(params, &pl_icc_default_params);
+    pl_icc_object cur = *out_icc;
+    if (!cur && !profile)
+        return false; // nothing to update
+
+    const uint64_t wanted_sig = profile ? profile->signature : cur->signature;
+    const bool need_open = !cur || cur->signature != wanted_sig;
+    if (need_open) {
+        pl_assert(profile);
+        pl_icc_close(&cur);
+        cur = pl_icc_open(log, profile, params);
+        *out_icc = cur;
+        return cur != NULL;
+    }
+
+    // A requested size of zero means "keep whatever is currently in use"
+    const struct pl_icc_params *old = &cur->params;
+    int want_r = PL_DEF(params->size_r, old->size_r);
+    int want_g = PL_DEF(params->size_g, old->size_g);
+    int want_b = PL_DEF(params->size_b, old->size_b);
+
+    const bool changed = params->intent != old->intent ||
+                         params->max_luma != old->max_luma ||
+                         params->force_bpc != old->force_bpc ||
+                         want_r != old->size_r ||
+                         want_g != old->size_g ||
+                         want_b != old->size_b;
+    if (!changed)
+        return true;
+
+    // ICC signature is the same but parameters are different, re-open in-place
+    if (icc_reopen(cur, params))
+        return true;
+
+    pl_icc_close(&cur);
+    *out_icc = NULL;
+    return false;
+}
+
+// Fills a 3DLUT by running a regular RGB grid through an lcms2 transform.
+//
+// datap:  output buffer of width * height * depth texels, each made of four
+//         uint16_t values (the transform writes TYPE_RGBA_16)
+// params: LUT parameters; `priv` is the pl_icc_object, and width / height /
+//         depth are the grid sizes along R / G / B
+// decode: true maps profile -> approximation profile, false the reverse
+//
+// If the transform cannot be created, the function returns without writing
+// to `datap`.
+static void fill_lut(void *datap, const struct sh_lut_params *params, bool decode)
+{
+    // Input lines are packed RGB, output texels are RGBA
+    enum { IN_COMPS = 3, OUT_COMPS = 4 };
+
+    pl_icc_object icc = params->priv;
+    struct icc_priv *p = PL_PRIV(icc);
+    cmsHPROFILE srcp = decode ? p->profile : p->approx;
+    cmsHPROFILE dstp = decode ? p->approx : p->profile;
+    int s_r = params->width, s_g = params->height, s_b = params->depth;
+
+    // Grid step divisors; guarded so that a dimension of size 1 maps to 0
+    // instead of dividing by zero
+    const int d_r = PL_MAX(s_r - 1, 1);
+    const int d_g = PL_MAX(s_g - 1, 1);
+    const int d_b = PL_MAX(s_b - 1, 1);
+
+    pl_clock_t start = pl_clock_now();
+    cmsHTRANSFORM tf = cmsCreateTransformTHR(p->cms, srcp, TYPE_RGB_16,
+                                             dstp, TYPE_RGBA_16,
+                                             icc->params.intent,
+                                             cmsFLAGS_BLACKPOINTCOMPENSATION |
+                                             cmsFLAGS_NOCACHE | cmsFLAGS_NOOPTIMIZE);
+    if (!tf)
+        return;
+
+    pl_clock_t after_transform = pl_clock_now();
+    pl_log_cpu_time(p->log, start, after_transform, "creating ICC transform");
+
+    uint16_t *tmp = pl_alloc(NULL, s_r * IN_COMPS * sizeof(tmp[0]));
+    for (int b = 0; b < s_b; b++) {
+        for (int g = 0; g < s_g; g++) {
+            // Transform a single line of the output buffer
+            for (int r = 0; r < s_r; r++) {
+                tmp[r * IN_COMPS + 0] = r * 65535 / d_r;
+                tmp[r * IN_COMPS + 1] = g * 65535 / d_g;
+                tmp[r * IN_COMPS + 2] = b * 65535 / d_b;
+            }
+
+            size_t offset = ((size_t) b * s_g + g) * s_r * OUT_COMPS;
+            uint16_t *data = ((uint16_t *) datap) + offset;
+            cmsDoTransform(tf, tmp, data, s_r);
+
+            if (!icc->params.force_bpc)
+                continue;
+
+            // Fix the black point manually. Work-around for "improper"
+            // profiles, as black point compensation should already have
+            // taken care of this normally.
+            //
+            // NOTE(review): tmp[0] is the red value of the first texel of
+            // the line (always 0) and tmp[1] is the line's green value; the
+            // intent may have been to test green and blue — confirm.
+            const uint16_t knee = 16u << 8;
+            if (tmp[0] >= knee || tmp[1] >= knee)
+                continue;
+            for (int r = 0; r < s_r; r++) {
+                // Weighted input level: (2*G + B + R) / 4
+                uint16_t s = (2 * tmp[1] + tmp[2] + tmp[r * IN_COMPS]) >> 2;
+                if (s >= knee)
+                    break;
+                // Output texels are 4 components wide, so they must be
+                // addressed with the RGBA stride (not the RGB input stride)
+                uint16_t *px = &data[r * OUT_COMPS];
+                for (int c = 0; c < 3; c++)
+                    px[c] = (s * px[c] + (knee - s) * s) >> 12;
+            }
+        }
+    }
+
+    pl_log_cpu_time(p->log, after_transform, pl_clock_now(), "generating ICC 3DLUT");
+    cmsDeleteTransform(tf);
+    pl_free(tmp);
+}
+
+// LUT fill callback for the decoding direction; forwards to fill_lut().
+static void fill_decode(void *datap, const struct sh_lut_params *params)
+{
+    const bool decode = true;
+    fill_lut(datap, params, decode);
+}
+
+// LUT fill callback for the encoding direction; forwards to fill_lut().
+static void fill_encode(void *datap, const struct sh_lut_params *params)
+{
+    const bool decode = false;
+    fill_lut(datap, params, decode);
+}
+
+// Picks the cache to use for this object's LUTs. In order of preference: the
+// cache given in the ICC parameters, the object's private cache, and finally
+// the cache associated with the shader.
+static pl_cache get_cache(pl_icc_object icc, pl_shader sh)
+{
+    if (icc->params.cache)
+        return icc->params.cache;
+
+    struct icc_priv *priv = PL_PRIV(icc);
+    if (priv->cache)
+        return priv->cache;
+
+    return SH_CACHE(sh);
+}
+
+// Appends the ICC decoding step to the shader `sh`: the current color is
+// passed through a 3DLUT generated by fill_decode(), followed by the
+// analytical curve `Y = scale * (aX + b)^y` built from the object's private
+// `a`, `b`, `scale` values and its `gamma`.
+//
+// `lut_obj` holds the LUT state between calls. If `out_csp` is non-NULL and
+// the shader was generated successfully, it receives the resulting color
+// space (the object's containing primaries, linear transfer, and the HDR
+// metadata of `icc->csp`). On failure the shader is marked as failed (or the
+// signature requirement is not met) and `out_csp` is left untouched.
+void pl_icc_decode(pl_shader sh, pl_icc_object icc, pl_shader_obj *lut_obj,
+ struct pl_color_space *out_csp)
+{
+ struct icc_priv *p = PL_PRIV(icc);
+ if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+ return;
+
+ // The LUT data is 4x 16-bit, so look for a matching linearly-sampleable
+ // UNORM texture format
+ pl_fmt fmt = pl_find_fmt(SH_GPU(sh), PL_FMT_UNORM, 4, 16, 16, PL_FMT_CAP_LINEAR);
+ if (!fmt) {
+ SH_FAIL(sh, "Failed finding ICC 3DLUT texture format!");
+ return;
+ }
+
+ ident_t lut = sh_lut(sh, sh_lut_params(
+ .object = lut_obj,
+ .var_type = PL_VAR_FLOAT,
+ .method = SH_LUT_TETRAHEDRAL,
+ .fmt = fmt,
+ .width = icc->params.size_r,
+ .height = icc->params.size_g,
+ .depth = icc->params.size_b,
+ .comps = 4,
+ .signature = p->lut_sig,
+ .fill = fill_decode,
+ .cache = get_cache(icc, sh),
+ .priv = (void *) icc,
+ ));
+
+ if (!lut) {
+ SH_FAIL(sh, "pl_icc_decode: failed generating LUT object");
+ return;
+ }
+
+ // Y = scale * (aX + b)^y
+ sh_describe(sh, "ICC 3DLUT");
+ GLSL("// pl_icc_decode \n"
+ "{ \n"
+ "color.rgb = "$"(color.rgb).rgb; \n"
+ "color.rgb = "$" * color.rgb + vec3("$"); \n"
+ "color.rgb = pow(color.rgb, vec3("$")); \n"
+ "color.rgb = "$" * color.rgb; \n"
+ "} \n",
+ lut,
+ SH_FLOAT(p->a), SH_FLOAT(p->b),
+ SH_FLOAT(icc->gamma),
+ SH_FLOAT(p->scale));
+
+ if (out_csp) {
+ *out_csp = (struct pl_color_space) {
+ .primaries = icc->containing_primaries,
+ .transfer = PL_COLOR_TRC_LINEAR,
+ .hdr = icc->csp.hdr,
+ };
+ }
+}
+
+// Appends the ICC encoding step to the shader `sh`, the inverse of
+// pl_icc_decode(): the current color is clamped to be non-negative, passed
+// through the inverse analytical curve `X = 1/a * (Y/scale)^(1/y) - b/a`, and
+// finally through a 3DLUT generated by fill_encode().
+//
+// `lut_obj` holds the LUT state between calls. On failure the shader is
+// marked as failed (or the signature requirement is not met).
+void pl_icc_encode(pl_shader sh, pl_icc_object icc, pl_shader_obj *lut_obj)
+{
+ struct icc_priv *p = PL_PRIV(icc);
+ if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+ return;
+
+ // The LUT data is 4x 16-bit, so look for a matching linearly-sampleable
+ // UNORM texture format
+ pl_fmt fmt = pl_find_fmt(SH_GPU(sh), PL_FMT_UNORM, 4, 16, 16, PL_FMT_CAP_LINEAR);
+ if (!fmt) {
+ SH_FAIL(sh, "Failed finding ICC 3DLUT texture format!");
+ return;
+ }
+
+ ident_t lut = sh_lut(sh, sh_lut_params(
+ .object = lut_obj,
+ .var_type = PL_VAR_FLOAT,
+ .method = SH_LUT_TETRAHEDRAL,
+ .fmt = fmt,
+ .width = icc->params.size_r,
+ .height = icc->params.size_g,
+ .depth = icc->params.size_b,
+ .comps = 4,
+ .signature = ~p->lut_sig, // avoid confusion with decoding LUTs
+ .fill = fill_encode,
+ .cache = get_cache(icc, sh),
+ .priv = (void *) icc,
+ ));
+
+ if (!lut) {
+ SH_FAIL(sh, "pl_icc_encode: failed generating LUT object");
+ return;
+ }
+
+ // X = 1/a * (Y/scale)^(1/y) - b/a
+ sh_describe(sh, "ICC 3DLUT");
+ GLSL("// pl_icc_encode \n"
+ "{ \n"
+ "color.rgb = max(color.rgb, 0.0); \n"
+ "color.rgb = 1.0/"$" * color.rgb; \n"
+ "color.rgb = pow(color.rgb, vec3("$")); \n"
+ "color.rgb = 1.0/"$" * color.rgb - "$"; \n"
+ "color.rgb = "$"(color.rgb).rgb; \n"
+ "} \n",
+ SH_FLOAT(p->scale),
+ SH_FLOAT(1.0f / icc->gamma),
+ SH_FLOAT(p->a), SH_FLOAT(p->b / p->a),
+ lut);
+}
+
+#else // !PL_HAVE_LCMS
+
+// Stub used when LittleCMS 2 support is compiled out. No ICC object can be
+// created in this configuration, so there is nothing to release.
+void pl_icc_close(pl_icc_object *picc) {}
+// Stub used when LittleCMS 2 support is compiled out: logs an error and
+// always returns NULL.
+pl_icc_object pl_icc_open(pl_log log, const struct pl_icc_profile *profile,
+ const struct pl_icc_params *pparams)
+{
+ pl_err(log, "libplacebo compiled without LittleCMS 2 support!");
+ return NULL;
+}
+
+// Stub used when LittleCMS 2 support is compiled out: clears `*obj` and
+// always fails. The error is logged only on the first call, tracked by a
+// function-local static flag.
+bool pl_icc_update(pl_log log, pl_icc_object *obj,
+ const struct pl_icc_profile *profile,
+ const struct pl_icc_params *params)
+{
+ static bool warned;
+ if (!warned) {
+ pl_err(log, "libplacebo compiled without LittleCMS 2 support!");
+ warned = true;
+ }
+ *obj = NULL;
+ return false;
+}
+
+// Stub used when LittleCMS 2 support is compiled out. Never expected to be
+// called, since the stubbed open/update functions never hand out an object.
+void pl_icc_decode(pl_shader sh, pl_icc_object icc, pl_shader_obj *lut_obj,
+ struct pl_color_space *out_csp)
+{
+ pl_unreachable(); // can't get a pl_icc_object
+}
+
+// Stub used when LittleCMS 2 support is compiled out. Never expected to be
+// called, since the stubbed open/update functions never hand out an object.
+void pl_icc_encode(pl_shader sh, pl_icc_object icc, pl_shader_obj *lut_obj)
+{
+ pl_unreachable();
+}
+
+#endif
diff --git a/src/shaders/lut.c b/src/shaders/lut.c
new file mode 100644
index 0000000..b0124fc
--- /dev/null
+++ b/src/shaders/lut.c
@@ -0,0 +1,820 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <math.h>
+#include <ctype.h>
+
+#include "shaders.h"
+
+#include <libplacebo/shaders/lut.h>
+
+// Returns whether `c` can be the first character of a numeric LUT entry.
+// This is used to detect where the .cube header ends and the table body
+// begins. Besides digits and `-`, it accepts `+` and `.`, which the body
+// parser also allows inside an entry, so that values written as `.5` or
+// `+1.0` are not mistaken for (and skipped as) unknown header lines.
+static inline bool isnumeric(char c)
+{
+    return (c >= '0' && c <= '9') || c == '-' || c == '+' || c == '.';
+}
+
+// Frees a custom LUT by handing the pointer-to-pointer to pl_free_ptr().
+void pl_lut_free(struct pl_custom_lut **lut)
+{
+ pl_free_ptr(lut);
+}
+
+// Parses a LUT in the .cube text format from the `cstr_len` bytes at `cstr`.
+//
+// The header is read line by line until a line starting with a numeric
+// character is found. Recognized keywords are TITLE, LUT_3D_SIZE,
+// LUT_1D_SIZE, DOMAIN_MIN and DOMAIN_MAX; `#` comments are logged at debug
+// level and any other line is logged as a warning and ignored. The body must
+// then contain three values per LUT entry, which are rescaled from the
+// declared domain to the range 0.0 - 1.0.
+//
+// Returns the parsed LUT (whose signature is the hash of the input text and
+// which can be released with pl_lut_free()), or NULL after logging an error.
+struct pl_custom_lut *pl_lut_parse_cube(pl_log log, const char *cstr, size_t cstr_len)
+{
+    struct pl_custom_lut *lut = pl_zalloc_ptr(NULL, lut);
+    pl_str str = (pl_str) { (uint8_t *) cstr, cstr_len };
+    lut->signature = pl_str_hash(str);
+    int entries = 0;
+
+    float min[3] = { 0.0, 0.0, 0.0 };
+    float max[3] = { 1.0, 1.0, 1.0 };
+
+    // Parse header
+    while (str.len && !isnumeric(str.buf[0])) {
+        pl_str line = pl_str_strip(pl_str_getline(str, &str));
+        if (!line.len)
+            continue; // skip empty line
+
+        if (pl_str_eatstart0(&line, "TITLE")) {
+            pl_info(log, "Loading LUT: %.*s", PL_STR_FMT(pl_str_strip(line)));
+            continue;
+        }
+
+        if (pl_str_eatstart0(&line, "LUT_3D_SIZE")) {
+            line = pl_str_strip(line);
+            int size;
+            if (!pl_str_parse_int(line, &size)) {
+                pl_err(log, "Failed parsing dimension '%.*s'", PL_STR_FMT(line));
+                goto error;
+            }
+            if (size <= 0 || size > 1024) {
+                pl_err(log, "Invalid 3DLUT size: %dx%dx%d", size, size, size);
+                goto error;
+            }
+
+            lut->size[0] = lut->size[1] = lut->size[2] = size;
+            entries = size * size * size;
+            continue;
+        }
+
+        if (pl_str_eatstart0(&line, "LUT_1D_SIZE")) {
+            line = pl_str_strip(line);
+            int size;
+            if (!pl_str_parse_int(line, &size)) {
+                pl_err(log, "Failed parsing dimension '%.*s'", PL_STR_FMT(line));
+                goto error;
+            }
+            if (size <= 0 || size > 65536) {
+                pl_err(log, "Invalid 1DLUT size: %d", size);
+                goto error;
+            }
+
+            lut->size[0] = size;
+            lut->size[1] = lut->size[2] = 0;
+            entries = size;
+            continue;
+        }
+
+        if (pl_str_eatstart0(&line, "DOMAIN_MIN")) {
+            line = pl_str_strip(line);
+            if (!pl_str_parse_float(pl_str_split_char(line, ' ', &line), &min[0]) ||
+                !pl_str_parse_float(pl_str_split_char(line, ' ', &line), &min[1]) ||
+                !pl_str_parse_float(line, &min[2]))
+            {
+                pl_err(log, "Failed parsing domain: '%.*s'", PL_STR_FMT(line));
+                goto error;
+            }
+            continue;
+        }
+
+        if (pl_str_eatstart0(&line, "DOMAIN_MAX")) {
+            line = pl_str_strip(line);
+            if (!pl_str_parse_float(pl_str_split_char(line, ' ', &line), &max[0]) ||
+                !pl_str_parse_float(pl_str_split_char(line, ' ', &line), &max[1]) ||
+                !pl_str_parse_float(line, &max[2]))
+            {
+                pl_err(log, "Failed parsing domain: '%.*s'", PL_STR_FMT(line));
+                goto error;
+            }
+            continue;
+        }
+
+        if (pl_str_eatstart0(&line, "#")) {
+            pl_debug(log, "Unhandled .cube comment: %.*s",
+                     PL_STR_FMT(pl_str_strip(line)));
+            continue;
+        }
+
+        pl_warn(log, "Unhandled .cube line: %.*s", PL_STR_FMT(pl_str_strip(line)));
+    }
+
+    if (!entries) {
+        pl_err(log, "Missing LUT size specification?");
+        goto error;
+    }
+
+    // Reject empty or inverted domains, which would break the rescaling below
+    for (int i = 0; i < 3; i++) {
+        if (max[i] - min[i] < 1e-6) {
+            pl_err(log, "Invalid domain range: [%f, %f]", min[i], max[i]);
+            goto error;
+        }
+    }
+
+    // Allocated as a child of `lut`, so it goes away together with it
+    float *data = pl_alloc(lut, sizeof(float[3]) * entries);
+    lut->data = data;
+
+    // Parse LUT body
+    pl_clock_t start = pl_clock_now();
+    for (int n = 0; n < entries; n++) {
+        for (int c = 0; c < 3; c++) {
+            static const char * const digits = "0123456789.-+e";
+
+            // Extract valid digit sequence
+            size_t len = pl_strspn(str, digits);
+            pl_str entry = (pl_str) { str.buf, len };
+            str.buf += len;
+            str.len -= len;
+
+            if (!entry.len) {
+                if (!str.len) {
+                    // `n * 3 + c` values were parsed before running out
+                    pl_err(log, "Failed parsing LUT: Unexpected EOF, expected "
+                           "%d entries, got %d", entries * 3, n * 3 + c);
+                } else {
+                    pl_err(log, "Failed parsing LUT: Unexpected '%c', expected "
+                           "digit", str.buf[0]);
+                }
+                goto error;
+            }
+
+            float num;
+            if (!pl_str_parse_float(entry, &num)) {
+                pl_err(log, "Failed parsing float value '%.*s'", PL_STR_FMT(entry));
+                goto error;
+            }
+
+            // Rescale to range 0.0 - 1.0
+            *data++ = (num - min[c]) / (max[c] - min[c]);
+
+            // Skip whitespace between digits
+            str = pl_str_strip(str);
+        }
+    }
+
+    str = pl_str_strip(str);
+    if (str.len)
+        pl_warn(log, "Extra data after LUT?... ignoring '%c'", str.buf[0]);
+
+    pl_log_cpu_time(log, start, pl_clock_now(), "parsing .cube LUT");
+    return lut;
+
+error:
+    pl_free(lut);
+    return NULL;
+}
+
+// LUT fill callback for custom LUTs. Copies the packed RGB float triples of
+// the pl_custom_lut in `params->priv` into `datap`, expanding every entry to
+// four floats with the last one set to zero. Unset (zero) heights and depths
+// are treated as 1.
+static void fill_lut(void *datap, const struct sh_lut_params *params)
+{
+    const struct pl_custom_lut *lut = params->priv;
+
+    const int n_r = params->width;
+    const int n_g = PL_DEF(params->height, 1);
+    const int n_b = PL_DEF(params->depth, 1);
+
+    // Both buffers are traversed linearly, in the same entry order
+    const float *in = lut->data;
+    float *out = datap;
+    for (int b = 0; b < n_b; b++) {
+        for (int g = 0; g < n_g; g++) {
+            for (int r = 0; r < n_r; r++) {
+                out[0] = in[0];
+                out[1] = in[1];
+                out[2] = in[2];
+                out[3] = 0.0f;
+                in += 3;
+                out += 4;
+            }
+        }
+    }
+}
+
+// Appends the application of a custom 1D or 3D LUT to the shader `sh`.
+//
+// Does nothing if `lut` is NULL. The LUT must either have all three sizes
+// set (3D) or only the first one (1D); anything else fails the shader.
+// `lut_state` holds the LUT object between calls. If the LUT's `shaper_in`
+// and/or `shaper_out` matrices are not all-zero, they are applied to the
+// color before and after the lookup, respectively.
+void pl_shader_custom_lut(pl_shader sh, const struct pl_custom_lut *lut,
+                          pl_shader_obj *lut_state)
+{
+    if (!lut)
+        return;
+
+    const bool is_3d = lut->size[0] > 0 && lut->size[1] > 0 && lut->size[2] > 0;
+    const bool is_1d = lut->size[0] > 0 && !lut->size[1] && !lut->size[2];
+    if (!is_3d && !is_1d) {
+        SH_FAIL(sh, "Invalid dimensions %dx%dx%d for pl_custom_lut, must be 1D "
+                "or 3D!", lut->size[0], lut->size[1], lut->size[2]);
+        return;
+    }
+
+    if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+        return;
+
+    ident_t fun = sh_lut(sh, sh_lut_params(
+        .object = lut_state,
+        .var_type = PL_VAR_FLOAT,
+        .method = SH_LUT_TETRAHEDRAL,
+        .width = lut->size[0],
+        .height = lut->size[1],
+        .depth = lut->size[2],
+        .comps = 4, // for better texel alignment
+        .signature = lut->signature,
+        .fill = fill_lut,
+        .priv = (void *) lut,
+    ));
+
+    if (!fun) {
+        SH_FAIL(sh, "pl_shader_custom_lut: failed generating LUT object");
+        return;
+    }
+
+    GLSL("// pl_shader_custom_lut \n");
+
+    // An all-zero shaper matrix means "no shaper"
+    static const pl_matrix3x3 zero = {0};
+    if (memcmp(&lut->shaper_in, &zero, sizeof(zero)) != 0) {
+        GLSL("color.rgb = "$" * color.rgb; \n", sh_var(sh, (struct pl_shader_var) {
+            .var = pl_var_mat3("shaper_in"),
+            .data = PL_TRANSPOSE_3X3(lut->shaper_in.m),
+        }));
+    }
+
+    if (is_3d) {
+        sh_describe(sh, "custom 3DLUT");
+        GLSL("color.rgb = "$"(color.rgb).rgb; \n", fun);
+    } else {
+        // 1D: each channel is looked up independently
+        sh_describe(sh, "custom 1DLUT");
+        GLSL("color.rgb = vec3("$"(color.r).r, \n"
+             " "$"(color.g).g, \n"
+             " "$"(color.b).b); \n",
+             fun, fun, fun);
+    }
+
+    if (memcmp(&lut->shaper_out, &zero, sizeof(zero)) != 0) {
+        GLSL("color.rgb = "$" * color.rgb; \n", sh_var(sh, (struct pl_shader_var) {
+            .var = pl_var_mat3("shaper_out"),
+            .data = PL_TRANSPOSE_3X3(lut->shaper_out.m),
+        }));
+    }
+}
+
+// Emits a LUT position helper macro into the shader header and returns its
+// name. The macro maps a position on an absolute texel scale (either in
+// texels, or normalized to [0,1]) linearly onto the texture coordinate of
+// the corresponding sample in a texture of dimension `lut_size`, i.e. onto
+// the span between the centers of the first and last texel.
+static ident_t texel_scale(pl_shader sh, int lut_size, bool normalized)
+{
+    // Texture coordinates of the first and last texel centers
+    const float first = 0.5f / lut_size;
+    const float last = 1.0f - 0.5f / lut_size;
+
+    // Length of the input scale that is mapped onto [first, last]
+    float extent = 1.0f;
+    if (!normalized)
+        extent = lut_size - 1;
+    const float slope = (last - first) / extent;
+
+    ident_t macro = sh_fresh(sh, "LUT_SCALE");
+    GLSLH("#define "$"(x) ("$" * (x) + "$") \n",
+          macro, SH_FLOAT(slope), SH_FLOAT(first));
+    return macro;
+}
+
+// Persistent state of a shader LUT object. Apart from the generated data, it
+// records the parameters the LUT was last generated with, which sh_lut()
+// compares against the requested ones to decide whether to regenerate.
+struct sh_lut_obj {
+ enum sh_lut_type type;
+ enum sh_lut_method method;
+ enum pl_var_type vartype;
+ pl_fmt fmt;
+ int width, height, depth, comps;
+ uint64_t signature;
+ bool error; // reset if params change
+
+ // weights, depending on the lut type
+ pl_tex tex; // used for SH_LUT_TEXTURE
+ pl_str str; // used for SH_LUT_LITERAL (GLSL array contents)
+ void *data; // used for SH_LUT_UNIFORM (copy of the raw LUT data)
+};
+
+// Uninit callback for LUT shader objects: releases the storage belonging to
+// each LUT type (raw data, literal string, texture) and leaves the object
+// zero-initialized.
+static void sh_lut_uninit(pl_gpu gpu, void *ptr)
+{
+    struct sh_lut_obj *obj = ptr;
+
+    pl_free(obj->data);
+    pl_free(obj->str.buf);
+    pl_tex_destroy(gpu, &obj->tex);
+
+    static const struct sh_lut_obj blank = {0};
+    *obj = blank;
+}
+
+// Limits on the number of LUT entries (width * height * depth) that may be
+// embedded as a literal array:
+// - SOFT: when the LUT type is left to be picked automatically, a LUT that
+//   uses no interpolation method and has at most this many entries prefers
+//   the literal representation
+// - HARD: LUTs with more entries than this are never embedded as literals
+#define SH_LUT_MAX_LITERAL_SOFT 64
+#define SH_LUT_MAX_LITERAL_HARD 256
+
+// Generates (or re-uses) the LUT described by `params` and emits the GLSL
+// needed to sample it into the shader header.
+//
+// The LUT contents are stored in the shader object `params->object` as a
+// texture, a uniform array or a literal array, depending on the requested
+// type and on what the GPU / GLSL version supports. They are only
+// (re)generated, via `params->fill` or the cache, when something relevant
+// changed since the last call.
+//
+// Returns the identifier of the generated lookup macro/function, or
+// NULL_IDENT on failure. After a failure, further calls with unchanged
+// parameters fail immediately without logging again.
+ident_t sh_lut(pl_shader sh, const struct sh_lut_params *params)
+{
+ pl_gpu gpu = SH_GPU(sh);
+ pl_cache_obj obj = { .key = CACHE_KEY_SH_LUT ^ params->signature };
+
+ const enum pl_var_type vartype = params->var_type;
+ pl_assert(vartype != PL_VAR_INVALID);
+ pl_assert(params->method == SH_LUT_NONE || vartype == PL_VAR_FLOAT);
+ pl_assert(params->width > 0 && params->height >= 0 && params->depth >= 0);
+ pl_assert(params->comps > 0);
+ pl_assert(!params->cache || params->signature);
+
+ int sizes[] = { params->width, params->height, params->depth };
+ int size = params->width * PL_DEF(params->height, 1) * PL_DEF(params->depth, 1);
+ int dims = params->depth ? 3 : params->height ? 2 : 1;
+ // Tetrahedral and cubic interpolation only exist for 3D LUTs, so lower
+ // dimensional LUTs fall back to linear interpolation
+ enum sh_lut_method method = params->method;
+ if (method == SH_LUT_TETRAHEDRAL && dims != 3)
+ method = SH_LUT_LINEAR;
+ if (method == SH_LUT_CUBIC && dims != 3)
+ method = SH_LUT_LINEAR;
+
+ // Maximum texture size per texture dimensionality (index = dimensions - 1).
+ // Zero disables that dimensionality: everything without a GPU, and 3D
+ // textures unless the GLSL version is above 100.
+ int texdim = 0;
+ uint32_t max_tex_dim[] = {
+ gpu ? gpu->limits.max_tex_1d_dim : 0,
+ gpu ? gpu->limits.max_tex_2d_dim : 0,
+ (gpu && gpu->glsl.version > 100) ? gpu->limits.max_tex_3d_dim : 0,
+ };
+
+ struct sh_lut_obj *lut = SH_OBJ(sh, params->object, PL_SHADER_OBJ_LUT,
+ struct sh_lut_obj, sh_lut_uninit);
+
+ if (!lut)
+ return NULL_IDENT;
+
+ // Regenerate if explicitly requested, or if any parameter differs from
+ // what the stored LUT was generated with
+ bool update = params->update || lut->signature != params->signature ||
+ vartype != lut->vartype || params->fmt != lut->fmt ||
+ params->width != lut->width || params->height != lut->height ||
+ params->depth != lut->depth || params->comps != lut->comps;
+
+ if (lut->error && !update)
+ return NULL_IDENT; // suppress error spam until something changes
+
+ // Try picking the right number of dimensions for the texture LUT. This
+ // allows e.g. falling back to 2D textures if 1D textures are unsupported.
+ for (int d = dims; d <= PL_ARRAY_SIZE(max_tex_dim); d++) {
+ // For a given dimension to be compatible, all coordinates need to be
+ // within the maximum texture size for that dimension
+ for (int i = 0; i < d; i++) {
+ if (sizes[i] > max_tex_dim[d - 1])
+ goto next_dim;
+ }
+
+ // All dimensions are compatible, so pick this texture dimension
+ texdim = d;
+ break;
+
+next_dim: ; // `continue` out of the inner loop
+ }
+
+ static const enum pl_fmt_type fmt_type[PL_VAR_TYPE_COUNT] = {
+ [PL_VAR_SINT] = PL_FMT_SINT,
+ [PL_VAR_UINT] = PL_FMT_UINT,
+ [PL_VAR_FLOAT] = PL_FMT_FLOAT,
+ };
+
+ // Linear and cubic LUTs rely on linear texture sampling; the other
+ // methods fetch individual texels
+ enum pl_fmt_caps texcaps = PL_FMT_CAP_SAMPLEABLE;
+ bool is_linear = method == SH_LUT_LINEAR || method == SH_LUT_CUBIC;
+ if (is_linear)
+ texcaps |= PL_FMT_CAP_LINEAR;
+
+ // A texture format given by the caller must agree with the LUT data type
+ // and provide all required capabilities
+ pl_fmt texfmt = params->fmt;
+ if (texfmt) {
+ bool ok;
+ switch (texfmt->type) {
+ case PL_FMT_SINT: ok = vartype == PL_VAR_SINT; break;
+ case PL_FMT_UINT: ok = vartype == PL_VAR_UINT; break;
+ default: ok = vartype == PL_VAR_FLOAT; break;
+ }
+
+ if (!ok) {
+ PL_ERR(sh, "Specified texture format '%s' does not match LUT "
+ "data type!", texfmt->name);
+ goto error;
+ }
+
+ if (~texfmt->caps & texcaps) {
+ PL_ERR(sh, "Specified texture format '%s' does not match "
+ "required capabilities 0x%x!\n", texfmt->name, texcaps);
+ goto error;
+ }
+ }
+
+ // Otherwise, if a texture of usable dimensions exists, look for a
+ // suitable format ourselves
+ if (texdim && !texfmt) {
+ texfmt = pl_find_fmt(gpu, fmt_type[vartype], params->comps,
+ vartype == PL_VAR_FLOAT ? 16 : 32,
+ pl_var_type_size(vartype) * 8,
+ texcaps);
+ }
+
+ enum sh_lut_type type = params->lut_type;
+
+ // The linear sampling code currently only supports 1D linear interpolation
+ if (is_linear && dims > 1) {
+ if (texfmt) {
+ type = SH_LUT_TEXTURE;
+ } else {
+ PL_ERR(sh, "Can't emulate linear LUTs for 2D/3D LUTs and no "
+ "texture support available!");
+ goto error;
+ }
+ }
+
+ bool can_uniform = gpu && gpu->limits.max_variable_comps >= size * params->comps;
+ bool can_literal = sh_glsl(sh).version > 110; // needed for literal arrays
+ can_literal &= size <= SH_LUT_MAX_LITERAL_HARD && !params->dynamic;
+
+ // Deselect unsupported methods
+ if (type == SH_LUT_UNIFORM && !can_uniform)
+ type = SH_LUT_AUTO;
+ if (type == SH_LUT_LITERAL && !can_literal)
+ type = SH_LUT_AUTO;
+ if (type == SH_LUT_TEXTURE && !texfmt)
+ type = SH_LUT_AUTO;
+
+ // Sorted by priority
+ if (!type && can_literal && !method && size <= SH_LUT_MAX_LITERAL_SOFT)
+ type = SH_LUT_LITERAL;
+ if (!type && texfmt)
+ type = SH_LUT_TEXTURE;
+ if (!type && can_uniform)
+ type = SH_LUT_UNIFORM;
+ if (!type && can_literal)
+ type = SH_LUT_LITERAL;
+
+ if (!type) {
+ PL_ERR(sh, "Can't generate LUT: no compatible methods!");
+ goto error;
+ }
+
+ // Reinitialize the existing LUT if needed
+ update |= type != lut->type;
+ update |= method != lut->method;
+
+ if (update) {
+ if (params->dynamic)
+ pl_log_level_cap(sh->log, PL_LOG_TRACE);
+
+ size_t el_size = params->comps * pl_var_type_size(vartype);
+ if (type == SH_LUT_TEXTURE)
+ el_size = texfmt->texel_size;
+
+ // Take the LUT data from the cache if an entry of the expected size
+ // exists, otherwise generate it with the fill callback
+ size_t buf_size = size * el_size;
+ if (pl_cache_get(params->cache, &obj) && obj.size == buf_size) {
+ PL_DEBUG(sh, "Re-using cached LUT (0x%"PRIx64") with size %zu",
+ obj.key, obj.size);
+ } else {
+ PL_DEBUG(sh, "LUT invalidated, regenerating..");
+ pl_cache_obj_resize(NULL, &obj, buf_size);
+ pl_clock_t start = pl_clock_now();
+ params->fill(obj.data, params);
+ pl_log_cpu_time(sh->log, start, pl_clock_now(), "generating shader LUT");
+ }
+
+ pl_assert(obj.data && obj.size);
+ if (params->dynamic)
+ pl_log_level_cap(sh->log, PL_LOG_NONE);
+
+ // Store the data in the representation required by the LUT type
+ switch (type) {
+ case SH_LUT_TEXTURE: {
+ if (!texdim) {
+ PL_ERR(sh, "Texture LUT exceeds texture dimensions!");
+ goto error;
+ }
+
+ if (!texfmt) {
+ PL_ERR(sh, "Found no compatible texture format for LUT!");
+ goto error;
+ }
+
+ struct pl_tex_params tex_params = {
+ .w = params->width,
+ .h = PL_DEF(params->height, texdim >= 2 ? 1 : 0),
+ .d = PL_DEF(params->depth, texdim >= 3 ? 1 : 0),
+ .format = texfmt,
+ .sampleable = true,
+ .host_writable = params->dynamic,
+ .initial_data = params->dynamic ? NULL : obj.data,
+ .debug_tag = params->debug_tag,
+ };
+
+ bool ok;
+ if (params->dynamic) {
+ ok = pl_tex_recreate(gpu, &lut->tex, &tex_params);
+ if (ok) {
+ ok = pl_tex_upload(gpu, pl_tex_transfer_params(
+ .tex = lut->tex,
+ .ptr = obj.data,
+ ));
+ }
+ } else {
+ // Can't use pl_tex_recreate because of `initial_data`
+ pl_tex_destroy(gpu, &lut->tex);
+ lut->tex = pl_tex_create(gpu, &tex_params);
+ ok = lut->tex;
+ }
+
+ if (!ok) {
+ PL_ERR(sh, "Failed creating LUT texture!");
+ goto error;
+ }
+ break;
+ }
+
+ case SH_LUT_UNIFORM:
+ pl_free(lut->data);
+ lut->data = pl_memdup(NULL, obj.data, obj.size);
+ break;
+
+ case SH_LUT_LITERAL: {
+ // Build the comma-separated contents of the GLSL array constructor
+ lut->str.len = 0;
+ static const char prefix[PL_VAR_TYPE_COUNT] = {
+ [PL_VAR_SINT] = 'i',
+ [PL_VAR_UINT] = 'u',
+ [PL_VAR_FLOAT] = ' ',
+ };
+
+ for (int i = 0; i < size * params->comps; i += params->comps) {
+ if (i > 0)
+ pl_str_append_asprintf_c(lut, &lut->str, ",");
+ if (params->comps > 1) {
+ pl_str_append_asprintf_c(lut, &lut->str, "%cvec%d(",
+ prefix[vartype], params->comps);
+ }
+ for (int c = 0; c < params->comps; c++) {
+ switch (vartype) {
+ case PL_VAR_FLOAT:
+ pl_str_append_asprintf_c(lut, &lut->str, "%s%f",
+ c > 0 ? "," : "",
+ ((float *) obj.data)[i+c]);
+ break;
+ case PL_VAR_UINT:
+ pl_str_append_asprintf_c(lut, &lut->str, "%s%u",
+ c > 0 ? "," : "",
+ ((unsigned int *) obj.data)[i+c]);
+ break;
+ case PL_VAR_SINT:
+ pl_str_append_asprintf_c(lut, &lut->str, "%s%d",
+ c > 0 ? "," : "",
+ ((int *) obj.data)[i+c]);
+ break;
+ case PL_VAR_INVALID:
+ case PL_VAR_TYPE_COUNT:
+ pl_unreachable();
+ }
+ }
+ if (params->comps > 1)
+ pl_str_append_asprintf_c(lut, &lut->str, ")");
+ }
+ break;
+ }
+
+ case SH_LUT_AUTO:
+ pl_unreachable();
+ }
+
+ // Remember what the LUT was generated with, for the next change check
+ lut->type = type;
+ lut->method = method;
+ lut->vartype = vartype;
+ lut->fmt = params->fmt;
+ lut->width = params->width;
+ lut->height = params->height;
+ lut->depth = params->depth;
+ lut->comps = params->comps;
+ lut->signature = params->signature;
+ pl_cache_set(params->cache, &obj);
+ }
+
+ // Done updating, generate the GLSL
+ ident_t name = sh_fresh(sh, "lut");
+ ident_t arr_name = NULL_IDENT;
+
+ static const char * const swizzles[] = {"x", "xy", "xyz", "xyzw"};
+ static const char * const vartypes[PL_VAR_TYPE_COUNT][4] = {
+ [PL_VAR_SINT] = { "int", "ivec2", "ivec3", "ivec4" },
+ [PL_VAR_UINT] = { "uint", "uvec2", "uvec3", "uvec4" },
+ [PL_VAR_FLOAT] = { "float", "vec2", "vec3", "vec4" },
+ };
+
+ switch (type) {
+ case SH_LUT_TEXTURE: {
+ assert(texdim);
+ ident_t tex = sh_desc(sh, (struct pl_shader_desc) {
+ .desc = {
+ .name = "weights",
+ .type = PL_DESC_SAMPLED_TEX,
+ },
+ .binding = {
+ .object = lut->tex,
+ .sample_mode = is_linear ? PL_TEX_SAMPLE_LINEAR
+ : PL_TEX_SAMPLE_NEAREST,
+ }
+ });
+
+ if (is_linear) {
+ // Sample with textureLod() at rescaled coordinates. Texture
+ // dimensions beyond those of the LUT get the fixed coordinate 0.5
+ ident_t pos_macros[PL_ARRAY_SIZE(sizes)] = {0};
+ for (int i = 0; i < dims; i++)
+ pos_macros[i] = texel_scale(sh, sizes[i], true);
+
+ GLSLH("#define "$"(pos) (textureLod("$", %s(\\\n",
+ name, tex, vartypes[PL_VAR_FLOAT][texdim - 1]);
+
+ for (int i = 0; i < texdim; i++) {
+ char sep = i == 0 ? ' ' : ',';
+ if (pos_macros[i]) {
+ if (dims > 1) {
+ GLSLH(" %c"$"(%s(pos).%c)\\\n", sep, pos_macros[i],
+ vartypes[PL_VAR_FLOAT][dims - 1], "xyzw"[i]);
+ } else {
+ GLSLH(" %c"$"(float(pos))\\\n", sep, pos_macros[i]);
+ }
+ } else {
+ GLSLH(" %c%f\\\n", sep, 0.5);
+ }
+ }
+ GLSLH(" ), 0.0).%s)\n", swizzles[params->comps - 1]);
+ } else {
+ // Fetch individual texels with texelFetch() at integer positions
+ GLSLH("#define "$"(pos) (texelFetch("$", %s(pos",
+ name, tex, vartypes[PL_VAR_SINT][texdim - 1]);
+
+ // Fill up extra components of the index
+ for (int i = dims; i < texdim; i++)
+ GLSLH(", 0");
+
+ GLSLH("), 0).%s)\n", swizzles[params->comps - 1]);
+ }
+ break;
+ }
+
+ case SH_LUT_UNIFORM:
+ arr_name = sh_var(sh, (struct pl_shader_var) {
+ .var = {
+ .name = "weights",
+ .type = vartype,
+ .dim_v = params->comps,
+ .dim_m = 1,
+ .dim_a = size,
+ },
+ .data = lut->data,
+ });
+ break;
+
+ case SH_LUT_LITERAL:
+ arr_name = sh_fresh(sh, "weights");
+ GLSLH("const %s "$"[%d] = %s[](\n ",
+ vartypes[vartype][params->comps - 1], arr_name, size,
+ vartypes[vartype][params->comps - 1]);
+ sh_append_str(sh, SH_BUF_HEADER, lut->str);
+ GLSLH(");\n");
+ break;
+
+ case SH_LUT_AUTO:
+ pl_unreachable();
+ }
+
+ // Array-backed LUTs (uniform / literal): emit an indexing macro that
+ // flattens the (possibly multi-dimensional) position into an array index
+ if (arr_name) {
+ GLSLH("#define "$"(pos) ("$"[int((pos)%s)\\\n",
+ name, arr_name, dims > 1 ? "[0]" : "");
+ int shift = params->width;
+ for (int i = 1; i < dims; i++) {
+ GLSLH(" + %d * int((pos)[%d])\\\n", shift, i);
+ shift *= sizes[i];
+ }
+ GLSLH(" ])\n");
+
+ if (is_linear) {
+ // Emulate 1D linear interpolation by mixing the two neighbours
+ pl_assert(dims == 1);
+ pl_assert(vartype == PL_VAR_FLOAT);
+ ident_t arr_lut = name;
+ name = sh_fresh(sh, "lut_lin");
+ GLSLH("%s "$"(float fpos) { \n"
+ " fpos = clamp(fpos, 0.0, 1.0) * %d.0; \n"
+ " float fbase = floor(fpos); \n"
+ " float fceil = ceil(fpos); \n"
+ " float fcoord = fpos - fbase; \n"
+ " return mix("$"(fbase), "$"(fceil), fcoord); \n"
+ "} \n",
+ vartypes[PL_VAR_FLOAT][params->comps - 1], name,
+ size - 1,
+ arr_lut, arr_lut);
+ }
+ }
+
+ // Tricubic interpolation, built from eight lookups of the linearly
+ // sampled LUT defined above
+ if (method == SH_LUT_CUBIC && dims == 3) {
+ ident_t lin_lut = name;
+ name = sh_fresh(sh, "lut_tricubic");
+ GLSLH("%s "$"(vec3 pos) { \n"
+ " vec3 scale = vec3(%d.0, %d.0, %d.0); \n"
+ " vec3 scale_inv = 1.0 / scale; \n"
+ " pos *= scale; \n"
+ " vec3 fpos = fract(pos); \n"
+ " vec3 base = pos - fpos; \n"
+ " vec3 fpos2 = fpos * fpos; \n"
+ " vec3 inv = 1.0 - fpos; \n"
+ " vec3 inv2 = inv * inv; \n"
+ " vec3 w0 = 1.0/6.0 * inv2 * inv; \n"
+ " vec3 w1 = 2.0/3.0 - 0.5 * fpos2 * (2.0 - fpos); \n"
+ " vec3 w2 = 2.0/3.0 - 0.5 * inv2 * (2.0 - inv); \n"
+ " vec3 w3 = 1.0/6.0 * fpos2 * fpos; \n"
+ " vec3 g0 = w0 + w1; \n"
+ " vec3 g1 = w2 + w3; \n"
+ " vec3 h0 = scale_inv * ((w1 / g0) - 1.0 + base); \n"
+ " vec3 h1 = scale_inv * ((w3 / g1) + 1.0 + base); \n"
+ " %s c000, c001, c010, c011, c100, c101, c110, c111; \n"
+ " c000 = "$"(h0); \n"
+ " c100 = "$"(vec3(h1.x, h0.y, h0.z)); \n"
+ " c000 = mix(c100, c000, g0.x); \n"
+ " c010 = "$"(vec3(h0.x, h1.y, h0.z)); \n"
+ " c110 = "$"(vec3(h1.x, h1.y, h0.z)); \n"
+ " c010 = mix(c110, c010, g0.x); \n"
+ " c000 = mix(c010, c000, g0.y); \n"
+ " c001 = "$"(vec3(h0.x, h0.y, h1.z)); \n"
+ " c101 = "$"(vec3(h1.x, h0.y, h1.z)); \n"
+ " c001 = mix(c101, c001, g0.x); \n"
+ " c011 = "$"(vec3(h0.x, h1.y, h1.z)); \n"
+ " c111 = "$"(h1); \n"
+ " c011 = mix(c111, c011, g0.x); \n"
+ " c001 = mix(c011, c001, g0.y); \n"
+ " return mix(c001, c000, g0.z); \n"
+ "} \n",
+ vartypes[PL_VAR_FLOAT][params->comps - 1], name,
+ sizes[0] - 1, sizes[1] - 1, sizes[2] - 1,
+ vartypes[PL_VAR_FLOAT][params->comps - 1],
+ lin_lut, lin_lut, lin_lut, lin_lut,
+ lin_lut, lin_lut, lin_lut, lin_lut);
+ }
+
+ if (method == SH_LUT_TETRAHEDRAL) {
+ ident_t int_lut = name;
+ name = sh_fresh(sh, "lut_barycentric");
+ GLSLH("%s "$"(vec3 pos) { \n"
+ // Compute bounding vertices and fractional part
+ " pos = clamp(pos, 0.0, 1.0) * vec3(%d.0, %d.0, %d.0); \n"
+ " vec3 base = floor(pos); \n"
+ " vec3 fpart = pos - base; \n"
+ // v0 and v3 are always 'black' and 'white', respectively
+ // v1 and v2 are the closest RGB and CMY vertices, respectively
+ " ivec3 v0 = ivec3(base), v3 = ivec3(ceil(pos)); \n"
+ " ivec3 v1 = v0, v2 = v3; \n"
+ // Table of boolean checks to simplify following math
+ " bvec3 c = greaterThanEqual(fpart.xyz, fpart.yzx); \n"
+ " bool c_xy = c.x, c_yx = !c.x, \n"
+ " c_yz = c.y, c_zy = !c.y, \n"
+ " c_zx = c.z, c_xz = !c.z; \n"
+ " vec3 s = fpart.xyz; \n"
+ " bool cond; \n",
+ vartypes[PL_VAR_FLOAT][params->comps - 1], name,
+ sizes[0] - 1, sizes[1] - 1, sizes[2] - 1);
+
+ // Subdivision of the cube into six congruent tetrahedras
+ //
+ // For each tetrahedron, test if the point is inside, and if so, update
+ // the edge vertices. We test all six, even though only one case will
+ // ever be true, because this avoids branches.
+ static const char *indices[] = { "xyz", "xzy", "zxy", "zyx", "yzx", "yxz"};
+ for (int i = 0; i < PL_ARRAY_SIZE(indices); i++) {
+ const char x = indices[i][0], y = indices[i][1], z = indices[i][2];
+ GLSLH("cond = c_%c%c && c_%c%c; \n"
+ "s = cond ? fpart.%c%c%c : s; \n"
+ "v1.%c = cond ? v3.%c : v1.%c; \n"
+ "v2.%c = cond ? v0.%c : v2.%c; \n",
+ x, y, y, z,
+ x, y, z,
+ x, x, x,
+ z, z, z);
+ }
+
+ // Interpolate in barycentric coordinates, with four texel fetches
+ GLSLH(" return (1.0 - s.x) * "$"(v0) + \n"
+ " (s.x - s.y) * "$"(v1) + \n"
+ " (s.y - s.z) * "$"(v2) + \n"
+ " (s.z) * "$"(v3); \n"
+ "} \n",
+ int_lut, int_lut, int_lut, int_lut);
+ }
+
+ lut->error = false;
+ pl_cache_obj_free(&obj);
+ pl_assert(name);
+ return name;
+
+error:
+ lut->error = true;
+ pl_cache_obj_free(&obj);
+ return NULL_IDENT;
+}
diff --git a/src/shaders/meson.build b/src/shaders/meson.build
new file mode 100644
index 0000000..746747c
--- /dev/null
+++ b/src/shaders/meson.build
@@ -0,0 +1,23 @@
+# Shader source files that are passed through the `glsl_preproc` command
+# below instead of being added to `sources` directly
+shader_sources = [
+ 'colorspace.c',
+ 'custom.c',
+ 'custom_mpv.c',
+ 'deinterlacing.c',
+ 'dithering.c',
+ 'film_grain.c',
+ 'film_grain_av1.c',
+ 'film_grain_h274.c',
+ 'icc.c',
+ 'lut.c',
+ 'sampling.c',
+]
+
+# Create one custom target per file, with an output of the same name as the
+# input, and append it to `sources`
+foreach s : shader_sources
+ sources += custom_target(s,
+ command: glsl_preproc,
+ depend_files: glsl_deps,
+ env: python_env,
+ input: s,
+ output: s,
+ )
+endforeach
diff --git a/src/shaders/sampling.c b/src/shaders/sampling.c
new file mode 100644
index 0000000..fc10f80
--- /dev/null
+++ b/src/shaders/sampling.c
@@ -0,0 +1,1198 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <math.h>
+#include "shaders.h"
+
+#include <libplacebo/colorspace.h>
+#include <libplacebo/shaders/sampling.h>
+
+const struct pl_deband_params pl_deband_default_params = { PL_DEBAND_DEFAULTS }; // fallback that pl_shader_deband selects through PL_DEF() for its `params` argument
+
+// Returns the texture parameters that describe `src`. When a texture is
+// attached its own parameters are returned; otherwise a zeroed parameter set
+// carrying only the declared `tex_w` x `tex_h` size is synthesized.
+static inline struct pl_tex_params src_params(const struct pl_sample_src *src)
+{
+    if (!src->tex) {
+        struct pl_tex_params virt = {0};
+        virt.w = src->tex_w;
+        virt.h = src->tex_h;
+        return virt;
+    }
+
+    return src->tex->params;
+}
+
+enum filter { // sample-mode requirement that a sampler passes to setup_src()
+ NEAREST = PL_TEX_SAMPLE_NEAREST, // nearest sampling is required
+ LINEAR = PL_TEX_SAMPLE_LINEAR, // linear sampling is required; setup_src() fails when it is unavailable
+ BEST, // textures: linear if the format has PL_FMT_CAP_LINEAR, else nearest; external samplers: src->mode
+ FASTEST, // textures: nearest; external samplers: src->mode
+};
+
+/* Helper function to compute the src/dst sizes and upscaling ratios.
+ *
+ * Shared front end of the samplers in this file. It picks the texture sample
+ * mode demanded by `filter`, derives the effective source and output sizes,
+ * registers the shader signature and output size through sh_require(), and
+ * makes the source available to the generated GLSL.
+ *
+ * Outputs:
+ *   src_tex, pos - identifiers of the sampler and of the sampling position;
+ *                  written unconditionally on the external-sampler path, so
+ *                  callers must not pass NULL
+ *   pt           - identifier of the texel size; checked for NULL on the
+ *                  external-sampler path, otherwise handed on to sh_bind()
+ *   ratio_x/y    - output size divided by the absolute source size (optional)
+ *   comp_mask    - components both requested and available (optional)
+ *   scale        - PL_DEF(src->scale, 1.0) (optional)
+ *
+ * With `resizeable` set, 0x0 is passed to sh_require() as the output size.
+ *
+ * Returns false when the required sample mode is unavailable (SH_FAIL is
+ * invoked) or when sh_require() fails.
+ */
+static bool setup_src(pl_shader sh, const struct pl_sample_src *src,
+ ident_t *src_tex, ident_t *pos, ident_t *pt,
+ float *ratio_x, float *ratio_y, uint8_t *comp_mask,
+ float *scale, bool resizeable,
+ enum filter filter)
+{
+ enum pl_shader_sig sig;
+ float src_w, src_h;
+ enum pl_tex_sample_mode sample_mode;
+ if (src->tex) { // sampling from a bound pl_tex
+ pl_fmt fmt = src->tex->params.format;
+ bool can_linear = fmt->caps & PL_FMT_CAP_LINEAR;
+ pl_assert(pl_tex_params_dimension(src->tex->params) == 2);
+ sig = PL_SHADER_SIG_NONE;
+ src_w = pl_rect_w(src->rect);
+ src_h = pl_rect_h(src->rect);
+ switch (filter) {
+ case FASTEST:
+ case NEAREST:
+ sample_mode = PL_TEX_SAMPLE_NEAREST;
+ break;
+ case LINEAR:
+ if (!can_linear) {
+ SH_FAIL(sh, "Trying to use a shader that requires linear "
+ "sampling with a texture whose format (%s) does not "
+ "support PL_FMT_CAP_LINEAR", fmt->name);
+ return false;
+ }
+ sample_mode = PL_TEX_SAMPLE_LINEAR;
+ break;
+ case BEST:
+ sample_mode = can_linear ? PL_TEX_SAMPLE_LINEAR : PL_TEX_SAMPLE_NEAREST;
+ break;
+ }
+ } else { // sampling through an externally provided sampler (no pl_tex)
+ pl_assert(src->tex_w && src->tex_h);
+ sig = PL_SHADER_SIG_SAMPLER;
+ src_w = src->sampled_w;
+ src_h = src->sampled_h;
+ if (filter == BEST || filter == FASTEST) {
+ sample_mode = src->mode; // no requirement: accept whatever the sampler uses
+ } else {
+ sample_mode = (enum pl_tex_sample_mode) filter; // NEAREST/LINEAR are defined as the matching PL_TEX_SAMPLE_* values
+ if (sample_mode != src->mode) {
+ SH_FAIL(sh, "Trying to use a shader that requires a different "
+ "filter mode than the external sampler.");
+ return false;
+ }
+ }
+ }
+
+ src_w = PL_DEF(src_w, src_params(src).w); // presumably falls back to the full texture size when unset - confirm PL_DEF semantics
+ src_h = PL_DEF(src_h, src_params(src).h);
+ pl_assert(src_w && src_h);
+
+ int out_w = PL_DEF(src->new_w, roundf(fabs(src_w))); // fabs(): the source size may be negative
+ int out_h = PL_DEF(src->new_h, roundf(fabs(src_h)));
+ pl_assert(out_w && out_h);
+
+ if (ratio_x)
+ *ratio_x = out_w / fabs(src_w);
+ if (ratio_y)
+ *ratio_y = out_h / fabs(src_h);
+ if (scale)
+ *scale = PL_DEF(src->scale, 1.0);
+
+ if (comp_mask) {
+ uint8_t tex_mask = 0x0Fu; // all four components when no texture is known
+ if (src->tex) {
+ // Mask containing only the number of components in the texture
+ tex_mask = (1 << src->tex->params.format->num_components) - 1;
+ }
+
+ uint8_t src_mask = src->component_mask;
+ if (!src_mask)
+ src_mask = (1 << PL_DEF(src->components, 4)) - 1; // no explicit mask: derive one from the component count
+
+ // Only actually sample components that are both requested and
+ // available in the texture being sampled
+ *comp_mask = tex_mask & src_mask;
+ }
+
+ if (resizeable)
+ out_w = out_h = 0; // hand 0x0 to sh_require() instead of a fixed size
+ if (!sh_require(sh, sig, out_w, out_h))
+ return false;
+
+ if (src->tex) {
+ pl_rect2df rect = { // src->rect with its extent replaced by the effective src_w/src_h
+ .x0 = src->rect.x0,
+ .y0 = src->rect.y0,
+ .x1 = src->rect.x0 + src_w,
+ .y1 = src->rect.y0 + src_h,
+ };
+
+ *src_tex = sh_bind(sh, src->tex, src->address_mode, sample_mode,
+ "src_tex", &rect, pos, pt);
+ } else {
+ if (pt) {
+ float sx = 1.0 / src->tex_w, sy = 1.0 / src->tex_h;
+ if (src->sampler == PL_SAMPLER_RECT)
+ sx = sy = 1.0; // rect samplers get a texel size of 1.0
+
+ *pt = sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_vec2("tex_pt"),
+ .data = &(float[2]) { sx, sy },
+ });
+ }
+
+ sh->sampler_type = src->sampler;
+
+ pl_assert(src->format);
+ switch (src->format) { // choose the sampler prefix character from the format type
+ case PL_FMT_UNKNOWN:
+ case PL_FMT_FLOAT:
+ case PL_FMT_UNORM:
+ case PL_FMT_SNORM: sh->sampler_prefix = ' '; break;
+ case PL_FMT_UINT: sh->sampler_prefix = 'u'; break;
+ case PL_FMT_SINT: sh->sampler_prefix = 's'; break;
+ case PL_FMT_TYPE_COUNT:
+ pl_unreachable();
+ }
+
+ *src_tex = sh_fresh(sh, "src_tex");
+ *pos = sh_fresh(sh, "pos");
+
+ GLSLH("#define "$" src_tex \n"
+ "#define "$" pos \n",
+ *src_tex, *pos);
+ }
+
+ return true;
+}
+/* Emits a debanding pass for `src` into `sh`, leaving the result in the GLSL
+ * variable `color`.
+ *
+ * The source is set up with a LINEAR sampling requirement; if setup_src()
+ * fails the function returns without emitting anything. `params` is passed
+ * through PL_DEF() with &pl_deband_default_params as the alternative.
+ *
+ * The alpha bit is removed from the component mask before debanding. For
+ * each iteration i in 1..params->iterations the shader averages four samples
+ * placed at quarter-turn intervals around the pixel, at a random angle and a
+ * random distance scaled by i * radius, and takes that average wherever it
+ * differs from the current value by no more than threshold / i. When
+ * params->grain > 0 a random offset is added afterwards, limited by the
+ * distance of the value from grain_neutral. The final color is multiplied
+ * by the source scale.
+ */
+void pl_shader_deband(pl_shader sh, const struct pl_sample_src *src,
+ const struct pl_deband_params *params)
+{
+ float scale;
+ ident_t tex, pos, pt;
+ uint8_t mask;
+ if (!setup_src(sh, src, &tex, &pos, &pt, NULL, NULL, &mask, &scale, false, LINEAR))
+ return;
+
+ params = PL_DEF(params, &pl_deband_default_params);
+ sh_describe(sh, "debanding");
+ GLSL("vec4 color; \n"
+ "// pl_shader_deband \n"
+ "{ \n"
+ "vec2 pos = "$", pt = "$"; \n"
+ "color = textureLod("$", pos, 0.0);\n",
+ pos, pt, tex);
+
+ mask &= ~0x8u; // ignore alpha channel
+ uint8_t num_comps = sh_num_comps(mask);
+ const char *swiz = sh_swizzle(mask);
+ pl_assert(num_comps <= 3);
+ if (!num_comps) { // nothing left to deband: only apply the scale and close the GLSL block
+ GLSL("color *= "$"; \n"
+ "} \n",
+ SH_FLOAT(scale));
+ return;
+ }
+
+ GLSL("#define GET(X, Y) \\\n"
+ " (textureLod("$", pos + pt * vec2(X, Y), 0.0).%s) \n"
+ "#define T %s \n",
+ tex, swiz, sh_float_type(mask));
+
+ ident_t prng = sh_prng(sh, true, NULL);
+ GLSL("T avg, diff, bound; \n"
+ "T res = color.%s; \n"
+ "vec2 d; \n",
+ swiz);
+
+ if (params->iterations > 0) {
+ ident_t radius = sh_const_float(sh, "radius", params->radius);
+ ident_t threshold = sh_const_float(sh, "threshold",
+ params->threshold / (1000 * scale)); // threshold is divided by 1000 and by the source scale
+
+ // For each iteration, compute the average at a given distance and
+ // pick it instead of the color if the difference is below the threshold.
+ for (int i = 1; i <= params->iterations; i++) {
+ GLSL(// Compute a random angle and distance
+ "d = "$".xy * vec2(%d.0 * "$", %f); \n"
+ "d = d.x * vec2(cos(d.y), sin(d.y)); \n"
+ // Sample at quarter-turn intervals around the source pixel
+ "avg = T(0.0); \n"
+ "avg += GET(+d.x, +d.y); \n"
+ "avg += GET(-d.x, +d.y); \n"
+ "avg += GET(-d.x, -d.y); \n"
+ "avg += GET(+d.x, -d.y); \n"
+ "avg *= 0.25; \n"
+ // Compare the (normalized) average against the pixel
+ "diff = abs(res - avg); \n"
+ "bound = T("$" / %d.0); \n",
+ prng, i, radius, M_PI * 2,
+ threshold, i);
+
+ if (num_comps > 1) { // vector types need the component-wise comparison
+ GLSL("res = mix(avg, res, greaterThan(diff, bound)); \n");
+ } else {
+ GLSL("res = mix(avg, res, diff > bound); \n");
+ }
+ }
+ }
+
+ // Add some random noise to smooth out residual differences
+ if (params->grain > 0) {
+ // Avoid adding grain near true black
+ GLSL("bound = T(\n");
+ for (int c = 0; c < num_comps; c++) {
+ GLSL("%c"$, c > 0 ? ',' : ' ',
+ SH_FLOAT(params->grain_neutral[c] / scale));
+ }
+ GLSL("); \n"
+ "T strength = min(abs(res - bound), "$"); \n"
+ "res += strength * (T("$") - T(0.5)); \n",
+ SH_FLOAT(params->grain / (1000.0 * scale)), prng);
+ }
+
+ GLSL("color.%s = res; \n"
+ "color *= "$"; \n"
+ "#undef T \n"
+ "#undef GET \n"
+ "} \n",
+ swiz, SH_FLOAT(scale));
+}
+
+// Samples `src` with a single texture fetch, using the best sample mode that
+// setup_src() can provide (BEST). The fetched value is multiplied by the
+// source scale and stored in the GLSL variable `color`.
+//
+// Returns false if setup_src() rejects the source.
+bool pl_shader_sample_direct(pl_shader sh, const struct pl_sample_src *src)
+{
+    ident_t sampler, coord;
+    float mul;
+    const bool ok = setup_src(sh, src, &sampler, &coord, NULL, NULL, NULL,
+                              NULL, &mul, true, BEST);
+    if (ok) {
+        GLSL("// pl_shader_sample_direct \n"
+             "vec4 color = vec4("$") * textureLod("$", "$", 0.0); \n",
+             SH_FLOAT(mul), sampler, coord);
+    }
+
+    return ok;
+}
+
+// Samples `src` with a single texture fetch that must use nearest sampling
+// (NEAREST). The fetched value is multiplied by the source scale and stored
+// in the GLSL variable `color`.
+//
+// Returns false if setup_src() rejects the source.
+bool pl_shader_sample_nearest(pl_shader sh, const struct pl_sample_src *src)
+{
+    ident_t sampler, coord;
+    float mul;
+    const bool ok = setup_src(sh, src, &sampler, &coord, NULL, NULL, NULL,
+                              NULL, &mul, true, NEAREST);
+    if (ok) {
+        sh_describe(sh, "nearest");
+        GLSL("// pl_shader_sample_nearest \n"
+             "vec4 color = vec4("$") * textureLod("$", "$", 0.0); \n",
+             SH_FLOAT(mul), sampler, coord);
+    }
+
+    return ok;
+}
+
+// Samples `src` with a single texture fetch that must use linear sampling
+// (LINEAR). The fetched value is multiplied by the source scale and stored
+// in the GLSL variable `color`.
+//
+// Returns false if setup_src() rejects the source, e.g. because linear
+// sampling is not available for it.
+bool pl_shader_sample_bilinear(pl_shader sh, const struct pl_sample_src *src)
+{
+    ident_t sampler, coord;
+    float mul;
+    const bool ok = setup_src(sh, src, &sampler, &coord, NULL, NULL, NULL,
+                              NULL, &mul, true, LINEAR);
+    if (ok) {
+        sh_describe(sh, "bilinear");
+        GLSL("// pl_shader_sample_bilinear \n"
+             "vec4 color = vec4("$") * textureLod("$", "$", 0.0); \n",
+             SH_FLOAT(mul), sampler, coord);
+    }
+
+    return ok;
+}
+/* Emits a fast bicubic sampler for `src`, leaving the result in the GLSL
+ * variable `color`.
+ *
+ * The source must support linear sampling (setup_src() is called with
+ * LINEAR). The shader computes four weights per axis from the fractional
+ * sample position, folds them into two offsets and a blend factor per axis,
+ * and combines four textureLod() fetches with mix(). A trace message is
+ * logged when either scaling ratio is below 1.
+ *
+ * Returns false if setup_src() fails, true otherwise.
+ */
+bool pl_shader_sample_bicubic(pl_shader sh, const struct pl_sample_src *src)
+{
+ ident_t tex, pos, pt;
+ float rx, ry, scale;
+ if (!setup_src(sh, src, &tex, &pos, &pt, &rx, &ry, NULL, &scale, true, LINEAR))
+ return false;
+
+ if (rx < 1 || ry < 1) { // ratio < 1 means the output is smaller than the source
+ PL_TRACE(sh, "Using fast bicubic sampling when downscaling. This "
+ "will most likely result in nasty aliasing!");
+ }
+
+ // Explanation of how bicubic scaling with only 4 texel fetches is done:
+ // http://www.mate.tue.nl/mate/pdfs/10318.pdf
+ // 'Efficient GPU-Based Texture Interpolation using Uniform B-Splines'
+
+ sh_describe(sh, "bicubic");
+#pragma GLSL /* pl_shader_sample_bicubic */ \
+ vec4 color; \
+ { \
+ vec2 pos = $pos; \
+ vec2 size = vec2(textureSize($tex, 0)); \
+ vec2 frac = fract(pos * size + vec2(0.5)); \
+ vec2 frac2 = frac * frac; \
+ vec2 inv = vec2(1.0) - frac; \
+ vec2 inv2 = inv * inv; \
+ /* compute filter weights directly */ \
+ vec2 w0 = 1.0/6.0 * inv2 * inv; \
+ vec2 w1 = 2.0/3.0 - 0.5 * frac2 * (2.0 - frac); \
+ vec2 w2 = 2.0/3.0 - 0.5 * inv2 * (2.0 - inv); \
+ vec2 w3 = 1.0/6.0 * frac2 * frac; \
+ vec4 g = vec4(w0 + w1, w2 + w3); \
+ vec4 h = vec4(w1, w3) / g + inv.xyxy; \
+ h.xy -= vec2(2.0); \
+ /* sample four corners, then interpolate */ \
+ vec4 p = pos.xyxy + $pt.xyxy * h; \
+ vec4 c00 = textureLod($tex, p.xy, 0.0); \
+ vec4 c01 = textureLod($tex, p.xw, 0.0); \
+ vec4 c0 = mix(c01, c00, g.y); \
+ vec4 c10 = textureLod($tex, p.zy, 0.0); \
+ vec4 c11 = textureLod($tex, p.zw, 0.0); \
+ vec4 c1 = mix(c11, c10, g.y); \
+ color = ${float:scale} * mix(c1, c0, g.x); \
+ }
+
+ return true;
+}
+/* Emits a fast hermite sampler for `src`, leaving the result in the GLSL
+ * variable `color`.
+ *
+ * The source must support linear sampling (setup_src() is called with
+ * LINEAR). The shader shifts the sampling position by
+ * pt * (smoothstep(0, 1, frac) - frac) and performs a single textureLod()
+ * fetch, multiplied by the source scale. A trace message is logged when
+ * either scaling ratio is below 1.
+ *
+ * Returns false if setup_src() fails, true otherwise.
+ */
+bool pl_shader_sample_hermite(pl_shader sh, const struct pl_sample_src *src)
+{
+ ident_t tex, pos, pt;
+ float rx, ry, scale;
+ if (!setup_src(sh, src, &tex, &pos, &pt, &rx, &ry, NULL, &scale, true, LINEAR))
+ return false;
+
+ if (rx < 1 || ry < 1) { // ratio < 1 means the output is smaller than the source
+ PL_TRACE(sh, "Using fast hermite sampling when downscaling. This "
+ "will most likely result in nasty aliasing!");
+ }
+
+ sh_describe(sh, "hermite");
+#pragma GLSL /* pl_shader_sample_hermite */ \
+ vec4 color; \
+ { \
+ vec2 pos = $pos; \
+ vec2 size = vec2(textureSize($tex, 0)); \
+ vec2 frac = fract(pos * size + vec2(0.5)); \
+ pos += $pt * (smoothstep(0.0, 1.0, frac) - frac); \
+ color = ${float:scale} * textureLod($tex, pos, 0.0); \
+ }
+
+ return true;
+}
+/* Emits a fast gaussian sampler for `src`, leaving the result in the GLSL
+ * variable `color`.
+ *
+ * The source must support linear sampling (setup_src() is called with
+ * LINEAR). The shader evaluates four exp()-based weights per axis, folds
+ * them into two offsets and an explicitly normalized blend factor per axis,
+ * and combines four textureLod() fetches with mix(). A trace message is
+ * logged when either scaling ratio is below 1.
+ *
+ * Returns false if setup_src() fails, true otherwise.
+ */
+bool pl_shader_sample_gaussian(pl_shader sh, const struct pl_sample_src *src)
+{
+ ident_t tex, pos, pt;
+ float rx, ry, scale;
+ if (!setup_src(sh, src, &tex, &pos, &pt, &rx, &ry, NULL, &scale, true, LINEAR))
+ return false;
+
+ if (rx < 1 || ry < 1) { // ratio < 1 means the output is smaller than the source
+ PL_TRACE(sh, "Using fast gaussian sampling when downscaling. This "
+ "will most likely result in nasty aliasing!");
+ }
+
+ sh_describe(sh, "gaussian");
+#pragma GLSL /* pl_shader_sample_gaussian */ \
+ vec4 color; \
+ { \
+ vec2 pos = $pos; \
+ vec2 size = vec2(textureSize($tex, 0)); \
+ vec2 off = -fract(pos * size + vec2(0.5)); \
+ vec2 off2 = -2.0 * off * off; \
+ /* compute gaussian weights */ \
+ vec2 w0 = exp(off2 + 4.0 * off - vec2(2.0)); \
+ vec2 w1 = exp(off2); \
+ vec2 w2 = exp(off2 - 4.0 * off - vec2(2.0)); \
+ vec2 w3 = exp(off2 - 8.0 * off - vec2(8.0)); \
+ vec4 g = vec4(w0 + w1, w2 + w3); \
+ vec4 h = vec4(w1, w3) / g; \
+ h.xy -= vec2(1.0); \
+ h.zw += vec2(1.0); \
+ g.xy /= g.xy + g.zw; /* explicitly normalize */ \
+ /* sample four corners, then interpolate */ \
+ vec4 p = pos.xyxy + $pt.xyxy * (h + off.xyxy); \
+ vec4 c00 = textureLod($tex, p.xy, 0.0); \
+ vec4 c01 = textureLod($tex, p.xw, 0.0); \
+ vec4 c0 = mix(c01, c00, g.y); \
+ vec4 c10 = textureLod($tex, p.zy, 0.0); \
+ vec4 c11 = textureLod($tex, p.zw, 0.0); \
+ vec4 c1 = mix(c11, c10, g.y); \
+ color = ${float:scale} * mix(c1, c0, g.x); \
+ }
+
+ return true;
+}
+/* Emits an "oversample" sampler for `src`, leaving the result in the GLSL
+ * variable `color`.
+ *
+ * The source must support linear sampling (setup_src() is called with
+ * LINEAR). The shader rescales the fractional sample position by the
+ * scaling ratios, clamps it to [0, 1] and performs one textureLod() fetch at
+ * the adjusted position. `threshold` is clamped to [0, 0.5]; when it is
+ * positive, blend coefficients below it become 0 and coefficients above
+ * 1 - threshold become 1.
+ *
+ * Returns false if setup_src() fails, true otherwise.
+ */
+bool pl_shader_sample_oversample(pl_shader sh, const struct pl_sample_src *src,
+ float threshold)
+{
+ ident_t tex, pos, pt;
+ float rx, ry, scale;
+ if (!setup_src(sh, src, &tex, &pos, &pt, &rx, &ry, NULL, &scale, true, LINEAR))
+ return false;
+
+ threshold = PL_CLAMP(threshold, 0.0f, 0.5f); // the template below emits the snapping code only for threshold > 0
+ sh_describe(sh, "oversample");
+ #pragma GLSL /* pl_shader_sample_oversample */ \
+ vec4 color; \
+ { \
+ vec2 pos = $pos; \
+ vec2 size = vec2(textureSize($tex, 0)); \
+ /* Round the position to the nearest pixel */ \
+ vec2 fcoord = fract(pos * size - vec2(0.5)); \
+ float rx = ${dynamic float:rx}; \
+ float ry = ${dynamic float:ry}; \
+ vec2 coeff = (fcoord - vec2(0.5)) * vec2(rx, ry); \
+ coeff = clamp(coeff + vec2(0.5), 0.0, 1.0); \
+ @if (threshold > 0) { \
+ float thresh = ${float:threshold}; \
+ coeff = mix(coeff, vec2(0.0), \
+ lessThan(coeff, vec2(thresh))); \
+ coeff = mix(coeff, vec2(1.0), \
+ greaterThan(coeff, vec2(1.0 - thresh))); \
+ @} \
+ \
+ /* Compute the right output blend of colors */ \
+ pos += (coeff - fcoord) * $pt; \
+ color = ${float:scale} * textureLod($tex, pos, 0.0); \
+ }
+
+ return true;
+}
+
+// Attaches a human-readable description of a scaling pass to `sh`.
+//
+// The direction is "up" when both ratios exceed 1, "down" when both are
+// below 1, "noop" when both equal 1 and "ana" in every other case. The
+// filter is identified by cfg->name if set, otherwise by its kernel name
+// (plus "+<window name>" when a window is configured); missing names are
+// printed as "unknown".
+static void describe_filter(pl_shader sh, const struct pl_filter_config *cfg,
+                            const char *stage, float rx, float ry)
+{
+    const char *kind = "ana";
+    if (rx > 1 && ry > 1)
+        kind = "up";
+    else if (rx < 1 && ry < 1)
+        kind = "down";
+    else if (rx == 1 && ry == 1)
+        kind = "noop";
+
+    if (cfg->name) {
+        sh_describef(sh, "%s %sscaling (%s)", stage, kind, cfg->name);
+        return;
+    }
+
+    if (!cfg->window) {
+        sh_describef(sh, "%s %sscaling (%s)", stage, kind,
+                     PL_DEF(cfg->kernel->name, "unknown"));
+        return;
+    }
+
+    sh_describef(sh, "%s %sscaling (%s+%s)", stage, kind,
+                 PL_DEF(cfg->kernel->name, "unknown"),
+                 PL_DEF(cfg->window->name, "unknown"));
+}
+
+// Subroutine for computing and adding an individual texel contribution
+// If `in` is NULL_IDENT, samples directly from `tex` at the texel offset (x, y)
+// If `in` is set, takes the pixel from inX[idx] where X is the component,
+// `in` is the given identifier, and `idx` must be defined by the caller
+static void polar_sample(pl_shader sh, pl_filter filter,
+ ident_t tex, ident_t lut, ident_t radius,
+ int x, int y, uint8_t comp_mask, ident_t in,
+ bool use_ar, ident_t scale)
+{
+ // Since we can't know the subpixel position in advance, assume a
+ // worst case scenario
+ int yy = y > 0 ? y-1 : y; // positive offsets can be up to one texel closer than their index
+ int xx = x > 0 ? x-1 : x;
+ float dmin = sqrt(xx*xx + yy*yy); // smallest distance this texel can have
+ // Skip samples definitely outside the radius
+ if (dmin >= filter->radius)
+ return;
+
+ // Check for samples that might be skippable
+ bool maybe_skippable = dmin >= filter->radius - M_SQRT2; // these get a run-time `d < radius` test in the emitted GLSL
+
+ // Check for samples that definitely won't contribute to anti-ringing
+ const float ar_radius = filter->radius_zero;
+ use_ar &= dmin < ar_radius;
+
+#pragma GLSL \
+ offset = ivec2(${const int: x}, ${const int: y}); \
+ d = length(vec2(offset) - fcoord); \
+ @if (maybe_skippable) \
+ if (d < $radius) { \
+ w = $lut(d * 1.0 / $radius); \
+ wsum += w; \
+ @if (in != NULL_IDENT) { \
+ @for (c : comp_mask) \
+ c[@c] = ${in}_@c[idx]; \
+ @} else { \
+ c = textureLod($tex, base + pt * vec2(offset), 0.0); \
+ @} \
+ @for (c : comp_mask) \
+ color[@c] += w * c[@c]; \
+ @if (use_ar) { \
+ if (d <= ${const float: ar_radius}) { \
+ @for (c : comp_mask) { \
+ cc = vec2($scale * c[@c]); \
+ cc.x = 1.0 - cc.x; \
+ ww = cc + vec2(0.10); \
+ ww = ww * ww; \
+ ww = ww * ww; \
+ ww = ww * ww; \
+ ww = ww * ww; \
+ ww = ww * ww; \
+ ww = w * ww; \
+ ar@c += ww * cc; \
+ wwsum@c += ww; \
+ @} \
+ } \
+ @} \
+ @if (maybe_skippable) \
+ }
+}
+
+struct sh_sampler_obj { // private state kept in a PL_SHADER_OBJ_SAMPLER shader object
+ pl_filter filter; // generated filter; regenerated by the samplers when the filter config changes
+ pl_shader_obj lut; // LUT object handed to sh_lut()
+ pl_shader_obj pass2; // nested sampler object that pl_shader_sample_ortho2 uses for its non-zero (SEP_HORIZ) pass
+};
+
+#define SCALER_LUT_SIZE 256 // lut_entries requested from pl_filter_generate(); also the LUT width (polar) or height (ortho)
+#define SCALER_LUT_CUTOFF 1e-3f // `cutoff` passed to pl_filter_generate() for polar filters
+
+// Destructor registered for sampler shader objects: destroys the LUT and the
+// nested second-pass object, frees the generated filter, and finally resets
+// the whole structure to zero. `gpu` is unused.
+static void sh_sampler_uninit(pl_gpu gpu, void *ptr)
+{
+    struct sh_sampler_obj *sampler = ptr;
+
+    pl_shader_obj_destroy(&sampler->lut);
+    pl_shader_obj_destroy(&sampler->pass2);
+    pl_filter_free(&sampler->filter);
+
+    const struct sh_sampler_obj cleared = {0};
+    *sampler = cleared;
+}
+
+// LUT fill callback for polar filters: copies the filter's weights, one
+// float per LUT entry, into `data`. `params->priv` must point to the
+// sh_sampler_obj that owns the filter.
+static void fill_polar_lut(void *data, const struct sh_lut_params *params)
+{
+    const struct sh_sampler_obj *sampler = params->priv;
+    pl_filter filter = sampler->filter;
+    const size_t num_bytes = params->width * sizeof(float);
+
+    pl_assert(params->width == filter->params.lut_entries && params->comps == 1);
+    memcpy(data, filter->weights, num_bytes);
+}
+/* Emits a polar (2D, distance-based) filter pass for `src`, leaving the
+ * result in the GLSL variable `color`.
+ *
+ * `params` must be non-NULL and describe a polar filter, otherwise the
+ * function fails. The filter is (re)generated into the sampler object
+ * obtained from params->lut whenever the effective config changes; the blur
+ * is widened by the inverse scaling ratio when downscaling unless
+ * params->no_widening is set.
+ *
+ * Two code paths exist. If compute shaders are allowed and available, all
+ * texels needed by a work group are loaded into shared arrays first;
+ * otherwise the texels are fetched in the fragment shader, using
+ * textureGather*() where the heuristics below allow it. Each texel
+ * contribution is emitted by polar_sample(). The accumulated color is
+ * divided by the weight sum, multiplied by the source scale, optionally
+ * anti-ringed, and alpha is set to 1.0 when it is not part of the mask.
+ *
+ * Returns false on any failure, true otherwise.
+ */
+bool pl_shader_sample_polar(pl_shader sh, const struct pl_sample_src *src,
+ const struct pl_sample_filter_params *params)
+{
+ pl_assert(params);
+ if (!params->filter.polar) {
+ SH_FAIL(sh, "Trying to use polar sampling with a non-polar filter?");
+ return false;
+ }
+
+ uint8_t cmask;
+ float rx, ry, scalef;
+ ident_t src_tex, pos, pt, scale;
+ if (!setup_src(sh, src, &src_tex, &pos, &pt, &rx, &ry, &cmask, &scalef, false, FASTEST))
+ return false;
+
+ struct sh_sampler_obj *obj;
+ obj = SH_OBJ(sh, params->lut, PL_SHADER_OBJ_SAMPLER, struct sh_sampler_obj,
+ sh_sampler_uninit);
+ if (!obj)
+ return false;
+
+ float inv_scale = 1.0 / PL_MIN(rx, ry); // exceeds 1 when downscaling along either axis
+ inv_scale = PL_MAX(inv_scale, 1.0);
+ if (params->no_widening)
+ inv_scale = 1.0;
+ scale = sh_const_float(sh, "scale", scalef);
+
+ struct pl_filter_config cfg = params->filter;
+ cfg.antiring = PL_DEF(cfg.antiring, params->antiring);
+ cfg.blur = PL_DEF(cfg.blur, 1.0f) * inv_scale;
+ bool update = !obj->filter || !pl_filter_config_eq(&obj->filter->params.config, &cfg); // regenerate only when there is no filter yet or the config differs
+ if (update) {
+ pl_filter_free(&obj->filter);
+ obj->filter = pl_filter_generate(sh->log, pl_filter_params(
+ .config = cfg,
+ .lut_entries = SCALER_LUT_SIZE,
+ .cutoff = SCALER_LUT_CUTOFF,
+ ));
+
+ if (!obj->filter) {
+ // This should never happen, but just in case ..
+ SH_FAIL(sh, "Failed initializing polar filter!");
+ return false;
+ }
+ }
+
+ describe_filter(sh, &cfg, "polar", rx, ry);
+ GLSL("// pl_shader_sample_polar \n"
+ "vec4 color = vec4(0.0); \n"
+ "{ \n"
+ "vec2 pos = "$", pt = "$"; \n"
+ "vec2 size = vec2(textureSize("$", 0)); \n"
+ "vec2 fcoord = fract(pos * size - vec2(0.5)); \n"
+ "vec2 base = pos - pt * fcoord; \n"
+ "vec2 center = base + pt * vec2(0.5); \n"
+ "ivec2 offset; \n"
+ "float w, d, wsum = 0.0; \n"
+ "int idx; \n"
+ "vec4 c; \n",
+ pos, pt, src_tex);
+
+ bool use_ar = cfg.antiring > 0;
+ if (use_ar) {
+#pragma GLSL \
+ vec2 ww, cc; \
+ @for (c : cmask) \
+ vec2 ar@c = vec2(0.0), wwsum@c = vec2(0.0);
+ }
+
+ int bound = ceil(obj->filter->radius);
+ int offset = bound - 1; // padding top/left
+ int padding = offset + bound; // total padding
+
+ // Determined experimentally on modern AMD and Nvidia hardware. 32 is a
+ // good tradeoff for the horizontal work group size. Apart from that,
+ // just use as many threads as possible.
+ const int bw = 32, bh = sh_glsl(sh).max_group_threads / bw;
+
+ // We need to sample everything from base_min to base_max, so make sure we
+ // have enough room in shmem. The extra margin on the ceilf guards against
+ // floating point inaccuracy on near-integer scaling ratios.
+ const float margin = 1e-5;
+ int iw = (int) ceilf(bw / rx - margin) + padding + 1,
+ ih = (int) ceilf(bh / ry - margin) + padding + 1;
+ int sizew = iw, sizeh = ih;
+
+ pl_gpu gpu = SH_GPU(sh);
+ bool dynamic_size = SH_PARAMS(sh).dynamic_constants ||
+ !gpu || !gpu->limits.array_size_constants;
+ if (dynamic_size) {
+ // Overallocate the array slightly to reduce recompilation overhead
+ sizew = PL_ALIGN2(sizew, 8);
+ sizeh = PL_ALIGN2(sizeh, 8);
+ }
+
+ int num_comps = __builtin_popcount(cmask); // NOTE(review): compiler builtin, not standard C
+ int shmem_req = (sizew * sizeh * num_comps + 2) * sizeof(float); // one float array per component, plus 2 floats for the shared vec2 base
+ bool is_compute = !params->no_compute && sh_glsl(sh).compute &&
+ sh_try_compute(sh, bw, bh, false, shmem_req);
+
+ // Note: SH_LUT_LITERAL might be faster in some specific cases, but not by
+ // much, and it's catastrophically slow on other platforms.
+ ident_t lut = sh_lut(sh, sh_lut_params(
+ .object = &obj->lut,
+ .lut_type = SH_LUT_TEXTURE,
+ .var_type = PL_VAR_FLOAT,
+ .method = SH_LUT_LINEAR,
+ .width = SCALER_LUT_SIZE,
+ .comps = 1,
+ .update = update,
+ .fill = fill_polar_lut,
+ .priv = obj,
+ ));
+
+ if (!lut) {
+ SH_FAIL(sh, "Failed initializing polar LUT!");
+ return false;
+ }
+
+ ident_t radius_c = sh_const_float(sh, "radius", obj->filter->radius);
+ ident_t in = sh_fresh(sh, "in");
+
+ if (is_compute) {
+
+ // Compute shader kernel
+ GLSL("uvec2 base_id = uvec2(0u); \n");
+ if (src->rect.x0 > src->rect.x1) // flipped source: use the last invocation along this axis as the reference
+ GLSL("base_id.x = gl_WorkGroupSize.x - 1u; \n");
+ if (src->rect.y0 > src->rect.y1)
+ GLSL("base_id.y = gl_WorkGroupSize.y - 1u; \n");
+
+ GLSLH("shared vec2 "$"_base; \n", in);
+ GLSL("if (gl_LocalInvocationID.xy == base_id) \n"
+ " "$"_base = base; \n"
+ "barrier(); \n"
+ "ivec2 rel = ivec2(round((base - "$"_base) * size)); \n",
+ in, in);
+
+ ident_t sizew_c = sh_const(sh, (struct pl_shader_const) {
+ .type = PL_VAR_SINT,
+ .compile_time = true,
+ .name = "sizew",
+ .data = &sizew,
+ });
+
+ ident_t sizeh_c = sh_const(sh, (struct pl_shader_const) {
+ .type = PL_VAR_SINT,
+ .compile_time = true,
+ .name = "sizeh",
+ .data = &sizeh,
+ });
+
+ ident_t iw_c = sizew_c, ih_c = sizeh_c;
+ if (dynamic_size) { // the array size was rounded up, so the loop bounds need their own constants
+ iw_c = sh_const_int(sh, "iw", iw);
+ ih_c = sh_const_int(sh, "ih", ih);
+ }
+
+ // Load all relevant texels into shmem
+ GLSL("for (int y = int(gl_LocalInvocationID.y); y < "$"; y += %d) { \n"
+ "for (int x = int(gl_LocalInvocationID.x); x < "$"; x += %d) { \n"
+ "c = textureLod("$", "$"_base + pt * vec2(x - %d, y - %d), 0.0); \n",
+ ih_c, bh, iw_c, bw, src_tex, in, offset, offset);
+
+ for (uint8_t comps = cmask; comps;) { // iterate over the set bits of the component mask
+ uint8_t c = __builtin_ctz(comps);
+ GLSLH("shared float "$"_%d["$" * "$"]; \n", in, c, sizeh_c, sizew_c);
+ GLSL(""$"_%d["$" * y + x] = c[%d]; \n", in, c, sizew_c, c);
+ comps &= ~(1 << c);
+ }
+
+ GLSL("}} \n"
+ "barrier(); \n");
+
+ // Dispatch the actual samples
+ for (int y = 1 - bound; y <= bound; y++) {
+ for (int x = 1 - bound; x <= bound; x++) {
+ GLSL("idx = "$" * rel.y + rel.x + "$" * %d + %d; \n",
+ sizew_c, sizew_c, y + offset, x + offset);
+ polar_sample(sh, obj->filter, src_tex, lut, radius_c,
+ x, y, cmask, in, use_ar, scale);
+ }
+ }
+ } else {
+ // Fragment shader sampling
+ for (uint8_t comps = cmask; comps;) { // one vec4 of gathered texels per sampled component
+ uint8_t c = __builtin_ctz(comps);
+ GLSL("vec4 "$"_%d; \n", in, c);
+ comps &= ~(1 << c);
+ }
+
+ // For maximum efficiency, we want to use textureGather() if
+ // possible, rather than direct sampling. Since this is not
+ // always possible/sensible, we need to possibly intermix gathering
+ // with regular sampling. This requires keeping track of which
+ // pixels in the next row were already gathered by the previous
+ // row.
+ uint32_t gathered_cur = 0x0, gathered_next = 0x0;
+ const float radius2 = PL_SQUARE(obj->filter->radius);
+ const int base = bound - 1;
+
+ if (base + bound >= 8 * sizeof(gathered_cur)) { // every column needs its own bit in the 32-bit masks
+ SH_FAIL(sh, "Polar radius %f exceeds implementation capacity!",
+ obj->filter->radius);
+ return false;
+ }
+
+ for (int y = 1 - bound; y <= bound; y++) {
+ for (int x = 1 - bound; x <= bound; x++) {
+ // Skip already gathered texels
+ uint32_t bit = 1llu << (base + x);
+ if (gathered_cur & bit)
+ continue;
+
+ // Using texture gathering is only more efficient than direct
+ // sampling in the case where we expect to be able to use all
+ // four gathered texels, without having to discard any. So
+ // only do it if we suspect it will be a win rather than a
+ // loss.
+ int xx = x*x, xx1 = (x+1)*(x+1);
+ int yy = y*y, yy1 = (y+1)*(y+1);
+ bool use_gather = PL_MAX(xx, xx1) + PL_MAX(yy, yy1) < radius2;
+ use_gather &= PL_MAX(x, y) <= sh_glsl(sh).max_gather_offset;
+ use_gather &= PL_MIN(x, y) >= sh_glsl(sh).min_gather_offset;
+ use_gather &= !src->tex || src->tex->params.format->gatherable;
+
+ // Gathering from components other than the R channel requires
+ // support for GLSL 400, which introduces the overload of
+ // textureGather* that allows specifying the component.
+ //
+ // This is also the minimum requirement if we don't know the
+ // texture format capabilities, for the sampler2D interface
+ if (cmask != 0x1 || !src->tex)
+ use_gather &= sh_glsl(sh).version >= 400;
+
+ if (!use_gather) {
+ // Switch to direct sampling instead
+ polar_sample(sh, obj->filter, src_tex, lut, radius_c,
+ x, y, cmask, NULL_IDENT, use_ar, scale);
+ continue;
+ }
+
+ // Gather the four surrounding texels simultaneously
+ for (uint8_t comps = cmask; comps;) {
+ uint8_t c = __builtin_ctz(comps);
+ if (x || y) {
+ if (c) {
+ GLSL($"_%d = textureGatherOffset("$", "
+ "center, ivec2(%d, %d), %d); \n",
+ in, c, src_tex, x, y, c);
+ } else {
+ GLSL($"_0 = textureGatherOffset("$", "
+ "center, ivec2(%d, %d)); \n",
+ in, src_tex, x, y);
+ }
+ } else {
+ if (c) {
+ GLSL($"_%d = textureGather("$", center, %d); \n",
+ in, c, src_tex, c);
+ } else {
+ GLSL($"_0 = textureGather("$", center); \n",
+ in, src_tex);
+ }
+ }
+ comps &= ~(1 << c);
+ }
+
+ // Mix in all of the points with their weights
+ for (int p = 0; p < 4; p++) {
+ // The four texels are gathered counterclockwise starting
+ // from the bottom left
+ static const int xo[4] = {0, 1, 1, 0};
+ static const int yo[4] = {1, 1, 0, 0};
+ if (x+xo[p] > bound || y+yo[p] > bound)
+ continue; // next subpixel
+
+ GLSL("idx = %d;\n", p);
+ polar_sample(sh, obj->filter, src_tex, lut, radius_c,
+ x+xo[p], y+yo[p], cmask, in, use_ar, scale);
+ }
+
+ // Mark the other next row's pixels as already gathered
+ gathered_next |= bit | (bit << 1);
+ x++; // skip adjacent pixel
+ }
+
+ // Prepare for new row
+ gathered_cur = gathered_next;
+ gathered_next = 0;
+ }
+ }
+
+#pragma GLSL \
+ color = $scale / wsum * color; \
+ @if (use_ar) { \
+ @for (c : cmask) { \
+ ww = ar@c / wwsum@c; \
+ ww.x = 1.0 - ww.x; \
+ w = clamp(color[@c], ww.x, ww.y); \
+ w = mix(w, dot(ww, vec2(0.5)), ww.x > ww.y); \
+ color[@c] = mix(color[@c], w, ${float:cfg.antiring}); \
+ @} \
+ @} \
+ @if (!(cmask & (1 << PL_CHANNEL_A))) \
+ color.a = 1.0; \
+ }
+
+ return true;
+}
+
+// LUT fill callback for separable (orthogonal) filters. `params->priv` must
+// point to the sh_sampler_obj that owns the filter.
+//
+// If the filter's main lobe covers its whole radius, every pair of adjacent
+// weights (w0, w1) is folded into a combined weight `w0 + w1` and the
+// relative offset `w1 / (w0 + w1)`, so the shader can fetch both taps with a
+// single linearly interpolated sample. Otherwise the weights are copied
+// verbatim.
+static void fill_ortho_lut(void *data, const struct sh_lut_params *params)
+{
+    const struct sh_sampler_obj *obj = params->priv;
+    pl_filter filt = obj->filter;
+
+    if (filt->radius != filt->radius_zero) {
+        size_t entries = SCALER_LUT_SIZE * filt->row_stride;
+        pl_assert(params->width * params->height * params->comps == entries);
+        memcpy(data, filt->weights, entries * sizeof(float));
+        return;
+    }
+
+    // Main lobe covers entire radius, so all weights are positive, meaning
+    // we can use the linear resampling trick
+    pl_assert(filt->row_size % 2 == 0);
+    for (int n = 0; n < SCALER_LUT_SIZE; n++) {
+        const float *weights = filt->weights + n * filt->row_stride;
+        float *row = (float *) data + n * filt->row_stride;
+        for (int i = 0; i < filt->row_size; i += 2) {
+            const float w0 = weights[i], w1 = weights[i+1];
+            const float sum = w0 + w1;
+            pl_assert(sum >= 0.0f);
+            row[i] = sum;
+            // A pair without any weight would otherwise yield 0/0 = NaN,
+            // which the linearly interpolated LUT would spread to the
+            // neighbouring rows. The offset is irrelevant for a zero weight,
+            // so store 0 instead.
+            row[i+1] = sum > 0.0f ? w1 / sum : 0.0f;
+        }
+    }
+}
+
+enum { // pass indices of separable (orthogonal) sampling, see pl_shader_sample_ortho2
+ SEP_VERT = 0, // pass along y (direction {0, 1}); uses the primary sampler object
+ SEP_HORIZ, // pass along x (direction {1, 0}); uses the nested pass2 sampler object
+ SEP_PASSES // number of passes, used as array size
+};
+/* Emits one pass of a separable (orthogonal) filter for `src`, leaving the
+ * result in the GLSL variable `color`.
+ *
+ * `params` must be non-NULL and describe a non-polar filter, and a GPU must
+ * be attached to the shader. The source may only be scaled along one axis:
+ * the other ratio has to be within 1e-6 of 1, otherwise the function fails.
+ * That axis selects the pass (SEP_VERT or SEP_HORIZ); the SEP_HORIZ pass
+ * keeps its filter and LUT in the `pass2` object nested inside the sampler
+ * object obtained from params->lut.
+ *
+ * The filter is (re)generated whenever the effective config changes; the
+ * blur is widened by the inverse scaling ratio when downscaling unless
+ * params->no_widening is set. When the filter's radius equals radius_zero,
+ * pairs of taps are fetched with a single linearly interpolated sample
+ * (see fill_ortho_lut). Anti-ringing is applied only when upscaling with a
+ * filter that does not use that trick.
+ *
+ * Returns false on any failure, true otherwise.
+ */
+bool pl_shader_sample_ortho2(pl_shader sh, const struct pl_sample_src *src,
+ const struct pl_sample_filter_params *params)
+{
+ pl_assert(params);
+ if (params->filter.polar) {
+ SH_FAIL(sh, "Trying to use separated sampling with a polar filter?");
+ return false;
+ }
+
+ pl_gpu gpu = SH_GPU(sh);
+ pl_assert(gpu);
+
+ uint8_t comps;
+ float ratio[SEP_PASSES], scale;
+ ident_t src_tex, pos, pt;
+ if (!setup_src(sh, src, &src_tex, &pos, &pt,
+ &ratio[SEP_HORIZ], &ratio[SEP_VERT],
+ &comps, &scale, false, LINEAR))
+ return false;
+
+
+ int pass;
+ if (fabs(ratio[SEP_HORIZ] - 1.0f) < 1e-6f) { // no horizontal scaling: this is the vertical pass
+ pass = SEP_VERT;
+ } else if (fabs(ratio[SEP_VERT] - 1.0f) < 1e-6f) {
+ pass = SEP_HORIZ;
+ } else {
+ SH_FAIL(sh, "Trying to use pl_shader_sample_ortho with a "
+ "pl_sample_src that requires scaling in multiple directions "
+ "(rx=%f, ry=%f), this is not possible!",
+ ratio[SEP_HORIZ], ratio[SEP_VERT]);
+ return false;
+ }
+
+ // We can store a separate sampler object per dimension, so dispatch the
+ // right one. This is needed for two reasons:
+ // 1. Anamorphic content can have a different scaling ratio for each
+ // dimension. In particular, you could be upscaling in one and
+ // downscaling in the other.
+ // 2. After fixing the source for `setup_src`, we lose information about
+ // the scaling ratio of the other component. (Although this is only a
+ // minor reason and could easily be changed with some boilerplate)
+ struct sh_sampler_obj *obj;
+ obj = SH_OBJ(sh, params->lut, PL_SHADER_OBJ_SAMPLER,
+ struct sh_sampler_obj, sh_sampler_uninit);
+ if (!obj)
+ return false;
+
+ if (pass != 0) { // SEP_HORIZ keeps its state in the nested pass2 object
+ obj = SH_OBJ(sh, &obj->pass2, PL_SHADER_OBJ_SAMPLER,
+ struct sh_sampler_obj, sh_sampler_uninit);
+ assert(obj); // NOTE(review): plain assert() here, unlike the pl_assert() used above; no run-time NULL check follows
+ }
+
+ float inv_scale = 1.0 / ratio[pass]; // exceeds 1 when downscaling
+ inv_scale = PL_MAX(inv_scale, 1.0);
+ if (params->no_widening)
+ inv_scale = 1.0;
+
+ struct pl_filter_config cfg = params->filter;
+ cfg.antiring = PL_DEF(cfg.antiring, params->antiring);
+ cfg.blur = PL_DEF(cfg.blur, 1.0f) * inv_scale;
+ bool update = !obj->filter || !pl_filter_config_eq(&obj->filter->params.config, &cfg); // regenerate only when there is no filter yet or the config differs
+
+ if (update) {
+ pl_filter_free(&obj->filter);
+ obj->filter = pl_filter_generate(sh->log, pl_filter_params(
+ .config = cfg,
+ .lut_entries = SCALER_LUT_SIZE,
+ .max_row_size = gpu->limits.max_tex_2d_dim / 4,
+ .row_stride_align = 4,
+ ));
+
+ if (!obj->filter) {
+ // This should never happen, but just in case ..
+ SH_FAIL(sh, "Failed initializing separated filter!");
+ return false;
+ }
+ }
+
+ int N = obj->filter->row_size; // number of samples to convolve
+ int width = obj->filter->row_stride / 4; // width of the LUT texture
+ ident_t lut = sh_lut(sh, sh_lut_params(
+ .object = &obj->lut,
+ .var_type = PL_VAR_FLOAT,
+ .method = SH_LUT_LINEAR,
+ .width = width,
+ .height = SCALER_LUT_SIZE,
+ .comps = 4,
+ .update = update,
+ .fill = fill_ortho_lut,
+ .priv = obj,
+ ));
+ if (!lut) {
+ SH_FAIL(sh, "Failed initializing separated LUT!");
+ return false;
+ }
+
+ const int dir[SEP_PASSES][2] = {
+ [SEP_HORIZ] = {1, 0},
+ [SEP_VERT] = {0, 1},
+ };
+
+ static const char *names[SEP_PASSES] = {
+ [SEP_HORIZ] = "ortho (horiz)",
+ [SEP_VERT] = "ortho (vert)",
+ };
+
+ describe_filter(sh, &cfg, names[pass], ratio[pass], ratio[pass]);
+
+ float denom = PL_MAX(1, width - 1); // avoid division by zero
+ bool use_ar = cfg.antiring > 0 && ratio[pass] > 1.0; // anti-ringing only when upscaling
+ bool use_linear = obj->filter->radius == obj->filter->radius_zero; // same condition fill_ortho_lut uses to pack weight pairs
+ use_ar &= !use_linear; // filter has no negative weights
+
+#pragma GLSL /* pl_shader_sample_ortho */ \
+ vec4 color = vec4(0.0, 0.0, 0.0, 1.0); \
+ { \
+ vec2 pos = $pos, pt = $pt; \
+ vec2 size = vec2(textureSize($src_tex, 0)); \
+ vec2 dir = vec2(${const float:dir[pass][0]}, ${const float: dir[pass][1]}); \
+ pt *= dir; \
+ vec2 fcoord2 = fract(pos * size - vec2(0.5)); \
+ float fcoord = dot(fcoord2, dir); \
+ vec2 base = pos - fcoord * pt - pt * vec2(${const float: N / 2 - 1}); \
+ vec4 ws; \
+ float off; \
+ ${vecType: comps} c, ca = ${vecType: comps}(0.0); \
+ @if (use_ar) { \
+ ${vecType: comps} hi = ${vecType: comps}(0.0); \
+ ${vecType: comps} lo = ${vecType: comps}(1e9); \
+ @} \
+ @for (n < N) { \
+ @if @(n % 4 == 0) \
+ ws = $lut(vec2(float(@n / 4) / ${const float: denom}, fcoord)); \
+ @if @(vars.use_ar && (n == vars.n / 2 - 1 || n == vars.n / 2)) { \
+ c = textureLod($src_tex, base + pt * @n.0, 0.0).${swizzle: comps}; \
+ ca += ws[@n % 4] * c; \
+ lo = min(lo, c); \
+ hi = max(hi, c); \
+ @} else { \
+ @if (use_linear) { \
+ @if @(n % 2 == 0) { \
+ off = @n.0 + ws[@n % 4 + 1]; \
+ ca += ws[@n % 4] * textureLod($src_tex, base + pt * off, \
+ 0.0).${swizzle: comps}; \
+ @} \
+ @} else { \
+ ca += ws[@n % 4] * textureLod($src_tex, base + pt * @n.0, \
+ 0.0).${swizzle: comps}; \
+ @} \
+ @} \
+ @} \
+ @if (use_ar) \
+ ca = mix(ca, clamp(ca, lo, hi), ${float: cfg.antiring}); \
+ color.${swizzle: comps} = ${float: scale} * ca; \
+ }
+
+ return true;
+}
+
+const struct pl_distort_params pl_distort_default_params = { PL_DISTORT_DEFAULTS }; // default parameter set, initialized from the PL_DISTORT_DEFAULTS macro
+
+void pl_shader_distort(pl_shader sh, pl_tex src_tex, int out_w, int out_h,
+ const struct pl_distort_params *params)
+{ // Emits GLSL that samples src_tex at canvas positions mapped back through the inverse of the composed transform
+ pl_assert(params); // params is mandatory; this function has no NULL fallback to the defaults
+ if (!sh_require(sh, PL_SHADER_SIG_NONE, out_w, out_h)) // nothing is emitted if sh_require rejects the request
+ return;
+
+ const int src_w = src_tex->params.w, src_h = src_tex->params.h;
+ float rx = 1.0f, ry = 1.0f; // longer source axis keeps extent 1, the shorter one is scaled by the aspect ratio
+ if (src_w > src_h) {
+ ry = (float) src_h / src_w;
+ } else {
+ rx = (float) src_w / src_h;
+ }
+
+ // Map from texel coordinates [0,1]² to aspect-normalized representation, i.e. [-rx,rx] x [-ry,ry] with the y axis flipped
+ const pl_transform2x2 tex2norm = {
+ .mat.m = {
+ { 2 * rx, 0 },
+ { 0, -2 * ry },
+ },
+ .c = { -rx, ry },
+ };
+
+ // Map from aspect-normalized representation to canvas coords [-1,1]² (additionally scaled by src/out size when params->unscaled is set)
+ const float sx = params->unscaled ? (float) src_w / out_w : 1.0f;
+ const float sy = params->unscaled ? (float) src_h / out_h : 1.0f;
+ const pl_transform2x2 norm2canvas = {
+ .mat.m = {
+ { sx / rx, 0 },
+ { 0, sy / ry },
+ },
+ };
+
+ struct pl_transform2x2 transform = params->transform; // local copy; composed in place below
+ pl_transform2x2_mul(&transform, &tex2norm); // combine the user transform with tex2norm
+ pl_transform2x2_rmul(&norm2canvas, &transform); // combine with norm2canvas; the result is what is called tex2canvas below
+
+ if (params->constrain) { // shrink (never enlarge, since k >= 2) so the transformed unit rect spans at most 2 per axis
+ pl_rect2df bb = pl_transform2x2_bounds(&transform, &(pl_rect2df) {
+ .x1 = 1, .y1 = 1,
+ });
+ const float k = fmaxf(fmaxf(pl_rect_w(bb), pl_rect_h(bb)), 2.0f);
+ pl_transform2x2_scale(&transform, 2.0f / k);
+ };
+
+ // Bind the canvas coordinates as [-1,1]², flipped vertically to correspond
+ // to normal mathematical axis conventions
+ static const pl_rect2df canvas = {
+ .x0 = -1.0f, .x1 = 1.0f,
+ .y0 = 1.0f, .y1 = -1.0f,
+ };
+
+ ident_t pos = sh_attr_vec2(sh, "pos", &canvas);
+ ident_t pt, tex = sh_bind(sh, src_tex, params->address_mode,
+ PL_TEX_SAMPLE_LINEAR, "tex", NULL, NULL, &pt); // pt is filled in by sh_bind; presumably the size of one texel -- TODO confirm
+
+ // Bind the inverse of the tex2canvas transform (i.e. canvas2tex)
+ pl_transform2x2_invert(&transform);
+ ident_t tf = sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_mat2("tf"),
+ .data = PL_TRANSPOSE_2X2(transform.mat.m), // transposed; presumably to match GLSL's column-major mat2 layout -- TODO confirm
+ });
+
+ ident_t tf_c = sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_vec2("tf_c"),
+ .data = transform.c,
+ });
+
+ // See pl_shader_sample_bicubic for the bicubic branch below; the non-bicubic branch is a single texture() fetch
+ sh_describe(sh, "distortion"); // when params->alpha_mode is set, the template fades color (premultiplied) or alpha over one pt at the [0,1] edges
+#pragma GLSL /* pl_shader_sample_distort */ \
+ vec4 color; \
+ { \
+ vec2 pos = $tf * $pos + $tf_c; \
+ vec2 pt = $pt; \
+ @if (params->bicubic) { \
+ vec2 size = vec2(textureSize($tex, 0)); \
+ vec2 frac = fract(pos * size + vec2(0.5)); \
+ vec2 frac2 = frac * frac; \
+ vec2 inv = vec2(1.0) - frac; \
+ vec2 inv2 = inv * inv; \
+ vec2 w0 = 1.0/6.0 * inv2 * inv; \
+ vec2 w1 = 2.0/3.0 - 0.5 * frac2 * (2.0 - frac); \
+ vec2 w2 = 2.0/3.0 - 0.5 * inv2 * (2.0 - inv); \
+ vec2 w3 = 1.0/6.0 * frac2 * frac; \
+ vec4 g = vec4(w0 + w1, w2 + w3); \
+ vec4 h = vec4(w1, w3) / g + inv.xyxy; \
+ h.xy -= vec2(2.0); \
+ vec4 p = pos.xyxy + pt.xyxy * h; \
+ vec4 c00 = textureLod($tex, p.xy, 0.0); \
+ vec4 c01 = textureLod($tex, p.xw, 0.0); \
+ vec4 c0 = mix(c01, c00, g.y); \
+ vec4 c10 = textureLod($tex, p.zy, 0.0); \
+ vec4 c11 = textureLod($tex, p.zw, 0.0); \
+ vec4 c1 = mix(c11, c10, g.y); \
+ color = mix(c1, c0, g.x); \
+ @} else { \
+ color = texture($tex, pos); \
+ @} \
+ @if (params->alpha_mode) { \
+ vec2 border = min(pos, vec2(1.0) - pos); \
+ border = smoothstep(vec2(0.0), pt, border); \
+ @if (params->alpha_mode == PL_ALPHA_PREMULTIPLIED) \
+ color.rgba *= border.x * border.y; \
+ @else \
+ color.a *= border.x * border.y; \
+ @} \
+ }
+
+}