diff options
Diffstat (limited to '')
-rw-r--r-- | src/shaders.c | 992 | ||||
-rw-r--r-- | src/shaders.h | 387 | ||||
-rw-r--r-- | src/shaders/colorspace.c | 2120 | ||||
-rw-r--r-- | src/shaders/custom.c | 89 | ||||
-rw-r--r-- | src/shaders/custom_mpv.c | 1768 | ||||
-rw-r--r-- | src/shaders/deinterlacing.c | 260 | ||||
-rw-r--r-- | src/shaders/dithering.c | 527 | ||||
-rw-r--r-- | src/shaders/film_grain.c | 65 | ||||
-rw-r--r-- | src/shaders/film_grain.h | 75 | ||||
-rw-r--r-- | src/shaders/film_grain_av1.c | 1001 | ||||
-rw-r--r-- | src/shaders/film_grain_h274.c | 815 | ||||
-rw-r--r-- | src/shaders/icc.c | 781 | ||||
-rw-r--r-- | src/shaders/lut.c | 820 | ||||
-rw-r--r-- | src/shaders/meson.build | 23 | ||||
-rw-r--r-- | src/shaders/sampling.c | 1198 |
15 files changed, 10921 insertions, 0 deletions
diff --git a/src/shaders.c b/src/shaders.c new file mode 100644 index 0000000..503ea78 --- /dev/null +++ b/src/shaders.c @@ -0,0 +1,992 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <stdio.h> +#include <math.h> + +#include "common.h" +#include "log.h" +#include "shaders.h" + +pl_shader_info pl_shader_info_ref(pl_shader_info pinfo) +{ + struct sh_info *info = (struct sh_info *) pinfo; + if (!info) + return NULL; + + pl_rc_ref(&info->rc); + return &info->info; +} + +void pl_shader_info_deref(pl_shader_info *pinfo) +{ + struct sh_info *info = (struct sh_info *) *pinfo; + if (!info) + return; + + if (pl_rc_deref(&info->rc)) + pl_free(info); + *pinfo = NULL; +} + +static struct sh_info *sh_info_alloc(void *alloc) +{ + struct sh_info *info = pl_zalloc_ptr(alloc, info); + info->tmp = pl_tmp(info); + pl_rc_init(&info->rc); + return info; +} + +// Re-use `sh_info` allocation if possible, allocate new otherwise +static struct sh_info *sh_info_recycle(struct sh_info *info) +{ + if (!pl_rc_deref(&info->rc)) + return sh_info_alloc(NULL); + + memset(&info->info, 0, sizeof(info->info)); // reset public fields + pl_free_children(info->tmp); + pl_rc_ref(&info->rc); + info->desc.len = 0; + info->steps.num = 0; + return info; +} + +static uint8_t reverse_bits(uint8_t x) +{ + static const uint8_t 
reverse_nibble[16] = { + 0x0, 0x8, 0x4, 0xc, 0x2, 0xa, 0x6, 0xe, + 0x1, 0x9, 0x5, 0xd, 0x3, 0xb, 0x7, 0xf, + }; + + return reverse_nibble[x & 0xF] << 4 | reverse_nibble[x >> 4]; +} + +static void init_shader(pl_shader sh, const struct pl_shader_params *params) +{ + if (params) { + sh->info->info.params = *params; + + // To avoid collisions for shaders with very high number of + // identifiers, pack the shader ID into the highest bits (MSB -> LSB) + pl_static_assert(sizeof(sh->prefix) > sizeof(params->id)); + const int shift = 8 * (sizeof(sh->prefix) - sizeof(params->id)); + sh->prefix = reverse_bits(params->id) << shift; + } + + sh->name = sh_fresh(sh, "main"); +} + +pl_shader pl_shader_alloc(pl_log log, const struct pl_shader_params *params) +{ + static const int glsl_ver_req = 130; + if (params && params->glsl.version && params->glsl.version < 130) { + pl_err(log, "Requested GLSL version %d too low (required: %d)", + params->glsl.version, glsl_ver_req); + return NULL; + } + + pl_shader sh = pl_alloc_ptr(NULL, sh); + *sh = (struct pl_shader_t) { + .log = log, + .tmp = pl_tmp(sh), + .info = sh_info_alloc(NULL), + .mutable = true, + }; + + for (int i = 0; i < PL_ARRAY_SIZE(sh->buffers); i++) + sh->buffers[i] = pl_str_builder_alloc(sh); + + init_shader(sh, params); + return sh; +} + +static void sh_obj_deref(pl_shader_obj obj); + +void sh_deref(pl_shader sh) +{ + pl_free_children(sh->tmp); + + for (int i = 0; i < sh->obj.num; i++) + sh_obj_deref(sh->obj.elem[i]); + sh->obj.num = 0; +} + +void pl_shader_free(pl_shader *psh) +{ + pl_shader sh = *psh; + if (!sh) + return; + + sh_deref(sh); + pl_shader_info_deref((pl_shader_info *) &sh->info); + pl_free_ptr(psh); +} + +void pl_shader_reset(pl_shader sh, const struct pl_shader_params *params) +{ + sh_deref(sh); + + struct pl_shader_t new = { + .log = sh->log, + .tmp = sh->tmp, + .info = sh_info_recycle(sh->info), + .data.buf = sh->data.buf, + .mutable = true, + + // Preserve array allocations + .obj.elem = sh->obj.elem, + 
.vas.elem = sh->vas.elem, + .vars.elem = sh->vars.elem, + .descs.elem = sh->descs.elem, + .consts.elem = sh->consts.elem, + }; + + // Preserve buffer allocations + memcpy(new.buffers, sh->buffers, sizeof(new.buffers)); + for (int i = 0; i < PL_ARRAY_SIZE(new.buffers); i++) + pl_str_builder_reset(new.buffers[i]); + + *sh = new; + init_shader(sh, params); +} + +static void *sh_alloc(pl_shader sh, size_t size, size_t align) +{ + const size_t offset = PL_ALIGN2(sh->data.len, align); + const size_t req_size = offset + size; + if (req_size <= pl_get_size(sh->data.buf)) { + sh->data.len = offset + size; + return sh->data.buf + offset; + } + + // We can't realloc this buffer because various pointers will be left + // dangling, so just reparent it onto `sh->tmp` (so it will be cleaned + // up when the shader is next reset) and allocate a new, larger buffer + // in its place + const size_t new_size = PL_MAX(req_size << 1, 256); + pl_steal(sh->tmp, sh->data.buf); + sh->data.buf = pl_alloc(sh, new_size); + sh->data.len = size; + return sh->data.buf; +} + +static void *sh_memdup(pl_shader sh, const void *data, size_t size, size_t align) +{ + if (!size) + return NULL; + + void *dst = sh_alloc(sh, size, align); + assert(data); + memcpy(dst, data, size); + return dst; +} + +bool pl_shader_is_failed(const pl_shader sh) +{ + return sh->failed; +} + +struct pl_glsl_version sh_glsl(const pl_shader sh) +{ + if (SH_PARAMS(sh).glsl.version) + return SH_PARAMS(sh).glsl; + + if (SH_GPU(sh)) + return SH_GPU(sh)->glsl; + + return (struct pl_glsl_version) { .version = 130 }; +} + +bool sh_try_compute(pl_shader sh, int bw, int bh, bool flex, size_t mem) +{ + pl_assert(bw && bh); + int *sh_bw = &sh->group_size[0]; + int *sh_bh = &sh->group_size[1]; + + struct pl_glsl_version glsl = sh_glsl(sh); + if (!glsl.compute) { + PL_TRACE(sh, "Disabling compute shader due to missing `compute` support"); + return false; + } + + if (sh->shmem + mem > glsl.max_shmem_size) { + PL_TRACE(sh, "Disabling compute 
shader due to insufficient shmem"); + return false; + } + + if (sh->type == SH_FRAGMENT) { + PL_TRACE(sh, "Disabling compute shader because shader is already marked " + "as fragment shader"); + return false; + } + + if (bw > glsl.max_group_size[0] || + bh > glsl.max_group_size[1] || + (bw * bh) > glsl.max_group_threads) + { + if (!flex) { + PL_TRACE(sh, "Disabling compute shader due to exceeded group " + "thread count."); + return false; + } else { + // Pick better group sizes + bw = PL_MIN(bw, glsl.max_group_size[0]); + bh = glsl.max_group_threads / bw; + } + } + + sh->shmem += mem; + + // If the current shader is either not a compute shader, or we have no + // choice but to override the metadata, always do so + if (sh->type != SH_COMPUTE || (sh->flexible_work_groups && !flex)) { + *sh_bw = bw; + *sh_bh = bh; + sh->type = SH_COMPUTE; + sh->flexible_work_groups = flex; + return true; + } + + // If both shaders are flexible, pick the larger of the two + if (sh->flexible_work_groups && flex) { + *sh_bw = PL_MAX(*sh_bw, bw); + *sh_bh = PL_MAX(*sh_bh, bh); + pl_assert(*sh_bw * *sh_bh <= glsl.max_group_threads); + return true; + } + + // At this point we're looking only at a non-flexible compute shader + pl_assert(sh->type == SH_COMPUTE && !sh->flexible_work_groups); + if (!flex) { + // Ensure parameters match + if (bw != *sh_bw || bh != *sh_bh) { + PL_TRACE(sh, "Disabling compute shader due to incompatible group " + "sizes %dx%d and %dx%d", *sh_bw, *sh_bh, bw, bh); + sh->shmem -= mem; + return false; + } + } + + return true; +} + +bool pl_shader_is_compute(const pl_shader sh) +{ + return sh->type == SH_COMPUTE; +} + +bool pl_shader_output_size(const pl_shader sh, int *w, int *h) +{ + if (!sh->output_w || !sh->output_h) + return false; + + *w = sh->transpose ? sh->output_h : sh->output_w; + *h = sh->transpose ? 
sh->output_w : sh->output_h; + return true; +} + +ident_t sh_fresh(pl_shader sh, const char *name) +{ + unsigned short id = ++sh->fresh; + assert(!(sh->prefix & id)); + id |= sh->prefix; + + assert(name); + return sh_mkident(id, name); +} + +static inline ident_t sh_fresh_name(pl_shader sh, const char **pname) +{ + ident_t id = sh_fresh(sh, *pname); + *pname = sh_ident_pack(id); + return id; +} + +ident_t sh_var(pl_shader sh, struct pl_shader_var sv) +{ + ident_t id = sh_fresh_name(sh, &sv.var.name); + struct pl_var_layout layout = pl_var_host_layout(0, &sv.var); + sv.data = sh_memdup(sh, sv.data, layout.size, layout.stride); + PL_ARRAY_APPEND(sh, sh->vars, sv); + return id; +} + +ident_t sh_var_int(pl_shader sh, const char *name, int val, bool dynamic) +{ + return sh_var(sh, (struct pl_shader_var) { + .var = pl_var_int(name), + .data = &val, + .dynamic = dynamic, + }); +} + +ident_t sh_var_uint(pl_shader sh, const char *name, unsigned int val, bool dynamic) +{ + return sh_var(sh, (struct pl_shader_var) { + .var = pl_var_uint(name), + .data = &val, + .dynamic = dynamic, + }); +} + +ident_t sh_var_float(pl_shader sh, const char *name, float val, bool dynamic) +{ + return sh_var(sh, (struct pl_shader_var) { + .var = pl_var_float(name), + .data = &val, + .dynamic = dynamic, + }); +} + +ident_t sh_var_mat3(pl_shader sh, const char *name, pl_matrix3x3 val) +{ + return sh_var(sh, (struct pl_shader_var) { + .var = pl_var_mat3(name), + .data = PL_TRANSPOSE_3X3(val.m), + }); +} + +ident_t sh_desc(pl_shader sh, struct pl_shader_desc sd) +{ + switch (sd.desc.type) { + case PL_DESC_BUF_UNIFORM: + case PL_DESC_BUF_STORAGE: + for (int i = 0; i < sh->descs.num; i++) // ensure uniqueness + pl_assert(sh->descs.elem[i].binding.object != sd.binding.object); + size_t bsize = sizeof(sd.buffer_vars[0]) * sd.num_buffer_vars; + sd.buffer_vars = sh_memdup(sh, sd.buffer_vars, bsize, + alignof(struct pl_buffer_var)); + for (int i = 0; i < sd.num_buffer_vars; i++) { + struct pl_var *bv = 
&sd.buffer_vars[i].var; + const char *name = bv->name; + GLSLP("#define %s "$"\n", name, sh_fresh_name(sh, &bv->name)); + } + break; + + case PL_DESC_BUF_TEXEL_UNIFORM: + case PL_DESC_BUF_TEXEL_STORAGE: + case PL_DESC_SAMPLED_TEX: + case PL_DESC_STORAGE_IMG: + pl_assert(!sd.num_buffer_vars); + break; + + case PL_DESC_INVALID: + case PL_DESC_TYPE_COUNT: + pl_unreachable(); + } + + ident_t id = sh_fresh_name(sh, &sd.desc.name); + PL_ARRAY_APPEND(sh, sh->descs, sd); + return id; +} + +ident_t sh_const(pl_shader sh, struct pl_shader_const sc) +{ + if (SH_PARAMS(sh).dynamic_constants && !sc.compile_time) { + return sh_var(sh, (struct pl_shader_var) { + .var = { + .name = sc.name, + .type = sc.type, + .dim_v = 1, + .dim_m = 1, + .dim_a = 1, + }, + .data = sc.data, + }); + } + + ident_t id = sh_fresh_name(sh, &sc.name); + + pl_gpu gpu = SH_GPU(sh); + if (gpu && gpu->limits.max_constants) { + if (!sc.compile_time || gpu->limits.array_size_constants) { + size_t size = pl_var_type_size(sc.type); + sc.data = sh_memdup(sh, sc.data, size, size); + PL_ARRAY_APPEND(sh, sh->consts, sc); + return id; + } + } + + // Fallback for GPUs without specialization constants + switch (sc.type) { + case PL_VAR_SINT: + GLSLH("const int "$" = %d; \n", id, *(int *) sc.data); + return id; + case PL_VAR_UINT: + GLSLH("const uint "$" = uint(%u); \n", id, *(unsigned int *) sc.data); + return id; + case PL_VAR_FLOAT: + GLSLH("const float "$" = float(%f); \n", id, *(float *) sc.data); + return id; + case PL_VAR_INVALID: + case PL_VAR_TYPE_COUNT: + break; + } + + pl_unreachable(); +} + +ident_t sh_const_int(pl_shader sh, const char *name, int val) +{ + return sh_const(sh, (struct pl_shader_const) { + .type = PL_VAR_SINT, + .name = name, + .data = &val, + }); +} + +ident_t sh_const_uint(pl_shader sh, const char *name, unsigned int val) +{ + return sh_const(sh, (struct pl_shader_const) { + .type = PL_VAR_UINT, + .name = name, + .data = &val, + }); +} + +ident_t sh_const_float(pl_shader sh, const char 
*name, float val) +{ + return sh_const(sh, (struct pl_shader_const) { + .type = PL_VAR_FLOAT, + .name = name, + .data = &val, + }); +} + +ident_t sh_attr(pl_shader sh, struct pl_shader_va sva) +{ + const size_t vsize = sva.attr.fmt->texel_size; + uint8_t *data = sh_alloc(sh, vsize * 4, vsize); + for (int i = 0; i < 4; i++) { + memcpy(data, sva.data[i], vsize); + sva.data[i] = data; + data += vsize; + } + + ident_t id = sh_fresh_name(sh, &sva.attr.name); + PL_ARRAY_APPEND(sh, sh->vas, sva); + return id; +} + +ident_t sh_attr_vec2(pl_shader sh, const char *name, const pl_rect2df *rc) +{ + pl_gpu gpu = SH_GPU(sh); + if (!gpu) { + SH_FAIL(sh, "Failed adding vertex attr '%s': No GPU available!", name); + return NULL_IDENT; + } + + pl_fmt fmt = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 2); + if (!fmt) { + SH_FAIL(sh, "Failed adding vertex attr '%s': no vertex fmt!", name); + return NULL_IDENT; + } + + float verts[4][2] = { + { rc->x0, rc->y0 }, + { rc->x1, rc->y0 }, + { rc->x0, rc->y1 }, + { rc->x1, rc->y1 }, + }; + + return sh_attr(sh, (struct pl_shader_va) { + .attr = { + .name = name, + .fmt = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 2), + }, + .data = { verts[0], verts[1], verts[2], verts[3] }, + }); +} + +ident_t sh_bind(pl_shader sh, pl_tex tex, + enum pl_tex_address_mode address_mode, + enum pl_tex_sample_mode sample_mode, + const char *name, const pl_rect2df *rect, + ident_t *out_pos, ident_t *out_pt) +{ + if (pl_tex_params_dimension(tex->params) != 2) { + SH_FAIL(sh, "Failed binding texture '%s': not a 2D texture!", name); + return NULL_IDENT; + } + + if (!tex->params.sampleable) { + SH_FAIL(sh, "Failed binding texture '%s': texture not sampleable!", name); + return NULL_IDENT; + } + + ident_t itex = sh_desc(sh, (struct pl_shader_desc) { + .desc = { + .name = name, + .type = PL_DESC_SAMPLED_TEX, + }, + .binding = { + .object = tex, + .address_mode = address_mode, + .sample_mode = sample_mode, + }, + }); + + float sx, sy; + if (tex->sampler_type == PL_SAMPLER_RECT) { + 
sx = 1.0; + sy = 1.0; + } else { + sx = 1.0 / tex->params.w; + sy = 1.0 / tex->params.h; + } + + if (out_pos) { + pl_rect2df full = { + .x1 = tex->params.w, + .y1 = tex->params.h, + }; + + rect = PL_DEF(rect, &full); + *out_pos = sh_attr_vec2(sh, "tex_coord", &(pl_rect2df) { + .x0 = sx * rect->x0, .y0 = sy * rect->y0, + .x1 = sx * rect->x1, .y1 = sy * rect->y1, + }); + } + + if (out_pt) { + *out_pt = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_vec2("tex_pt"), + .data = &(float[2]) {sx, sy}, + }); + } + + return itex; +} + +bool sh_buf_desc_append(void *alloc, pl_gpu gpu, + struct pl_shader_desc *buf_desc, + struct pl_var_layout *out_layout, + const struct pl_var new_var) +{ + struct pl_buffer_var bv = { .var = new_var }; + size_t cur_size = sh_buf_desc_size(buf_desc); + + switch (buf_desc->desc.type) { + case PL_DESC_BUF_UNIFORM: + bv.layout = pl_std140_layout(cur_size, &new_var); + if (bv.layout.offset + bv.layout.size > gpu->limits.max_ubo_size) + return false; + break; + case PL_DESC_BUF_STORAGE: + bv.layout = pl_std430_layout(cur_size, &new_var); + if (bv.layout.offset + bv.layout.size > gpu->limits.max_ssbo_size) + return false; + break; + case PL_DESC_INVALID: + case PL_DESC_SAMPLED_TEX: + case PL_DESC_STORAGE_IMG: + case PL_DESC_BUF_TEXEL_UNIFORM: + case PL_DESC_BUF_TEXEL_STORAGE: + case PL_DESC_TYPE_COUNT: + pl_unreachable(); + } + + if (out_layout) + *out_layout = bv.layout; + PL_ARRAY_APPEND_RAW(alloc, buf_desc->buffer_vars, buf_desc->num_buffer_vars, bv); + return true; +} + +size_t sh_buf_desc_size(const struct pl_shader_desc *buf_desc) +{ + if (!buf_desc->num_buffer_vars) + return 0; + + const struct pl_buffer_var *last; + last = &buf_desc->buffer_vars[buf_desc->num_buffer_vars - 1]; + return last->layout.offset + last->layout.size; +} + +void sh_describef(pl_shader sh, const char *fmt, ...) 
+{ + va_list ap; + va_start(ap, fmt); + sh_describe(sh, pl_vasprintf(sh->info->tmp, fmt, ap)); + va_end(ap); +} + +static const char *insigs[] = { + [PL_SHADER_SIG_NONE] = "", + [PL_SHADER_SIG_COLOR] = "vec4 color", +}; + +static const char *outsigs[] = { + [PL_SHADER_SIG_NONE] = "void", + [PL_SHADER_SIG_COLOR] = "vec4", +}; + +static const char *retvals[] = { + [PL_SHADER_SIG_NONE] = "", + [PL_SHADER_SIG_COLOR] = "return color;", +}; + +// libplacebo currently only allows 2D samplers for shader signatures +static const char *samplers2D[] = { + [PL_SAMPLER_NORMAL] = "sampler2D", + [PL_SAMPLER_RECT] = "sampler2DRect", + [PL_SAMPLER_EXTERNAL] = "samplerExternalOES", +}; + +ident_t sh_subpass(pl_shader sh, pl_shader sub) +{ + pl_assert(sh->mutable); + + if (sh->prefix == sub->prefix) { + PL_TRACE(sh, "Can't merge shaders: conflicting identifiers!"); + return NULL_IDENT; + } + + // Check for shader compatibility + int res_w = PL_DEF(sh->output_w, sub->output_w), + res_h = PL_DEF(sh->output_h, sub->output_h); + + if ((sub->output_w && res_w != sub->output_w) || + (sub->output_h && res_h != sub->output_h)) + { + PL_TRACE(sh, "Can't merge shaders: incompatible sizes: %dx%d and %dx%d", + sh->output_w, sh->output_h, sub->output_w, sub->output_h); + return NULL_IDENT; + } + + if (sub->type == SH_COMPUTE) { + int subw = sub->group_size[0], + subh = sub->group_size[1]; + bool flex = sub->flexible_work_groups; + + if (!sh_try_compute(sh, subw, subh, flex, sub->shmem)) { + PL_TRACE(sh, "Can't merge shaders: incompatible block sizes or " + "exceeded shared memory resource capabilities"); + return NULL_IDENT; + } + } + + sh->output_w = res_w; + sh->output_h = res_h; + + // Append the prelude and header + pl_str_builder_concat(sh->buffers[SH_BUF_PRELUDE], sub->buffers[SH_BUF_PRELUDE]); + pl_str_builder_concat(sh->buffers[SH_BUF_HEADER], sub->buffers[SH_BUF_HEADER]); + + // Append the body as a new header function + if (sub->input == PL_SHADER_SIG_SAMPLER) { + 
pl_assert(sub->sampler_prefix); + GLSLH("%s "$"(%c%s src_tex, vec2 tex_coord) {\n", + outsigs[sub->output], sub->name, + sub->sampler_prefix, samplers2D[sub->sampler_type]); + } else { + GLSLH("%s "$"(%s) {\n", + outsigs[sub->output], sub->name, insigs[sub->input]); + } + pl_str_builder_concat(sh->buffers[SH_BUF_HEADER], sub->buffers[SH_BUF_BODY]); + GLSLH("%s\n}\n\n", retvals[sub->output]); + + // Steal all inputs and objects from the subpass +#define ARRAY_STEAL(arr) do \ +{ \ + PL_ARRAY_CONCAT(sh, sh->arr, sub->arr); \ + sub->arr.num = 0; \ +} while (0) + + ARRAY_STEAL(obj); + ARRAY_STEAL(vas); + ARRAY_STEAL(vars); + ARRAY_STEAL(descs); + ARRAY_STEAL(consts); +#undef ARRAY_STEAL + + // Steal the scratch buffer (if it holds data) + if (sub->data.len) { + pl_steal(sh->tmp, sub->data.buf); + sub->data = (pl_str) {0}; + } + + // Steal all temporary allocations and mark the child as unusable + pl_steal(sh->tmp, sub->tmp); + sub->tmp = pl_tmp(sub); + sub->failed = true; + + // Steal the shader steps array (and allocations) + pl_assert(pl_rc_count(&sub->info->rc) == 1); + PL_ARRAY_CONCAT(sh->info, sh->info->steps, sub->info->steps); + pl_steal(sh->info->tmp, sub->info->tmp); + sub->info->tmp = pl_tmp(sub->info); + sub->info->steps.num = 0; // sanity + + return sub->name; +} + +pl_str_builder sh_finalize_internal(pl_shader sh) +{ + pl_assert(sh->mutable); // this function should only ever be called once + if (sh->failed) + return NULL; + + // Padding for readability + GLSLP("\n"); + + // Concatenate everything onto the prelude to form the final output + pl_str_builder_concat(sh->buffers[SH_BUF_PRELUDE], sh->buffers[SH_BUF_HEADER]); + + if (sh->input == PL_SHADER_SIG_SAMPLER) { + pl_assert(sh->sampler_prefix); + GLSLP("%s "$"(%c%s src_tex, vec2 tex_coord) {\n", + outsigs[sh->output], sh->name, + sh->sampler_prefix, + samplers2D[sh->sampler_type]); + } else { + GLSLP("%s "$"(%s) {\n", outsigs[sh->output], sh->name, insigs[sh->input]); + } + + 
pl_str_builder_concat(sh->buffers[SH_BUF_PRELUDE], sh->buffers[SH_BUF_BODY]); + pl_str_builder_concat(sh->buffers[SH_BUF_PRELUDE], sh->buffers[SH_BUF_FOOTER]); + GLSLP("%s\n}\n\n", retvals[sh->output]); + + // Generate the shader info + struct sh_info *info = sh->info; + info->info.steps = info->steps.elem; + info->info.num_steps = info->steps.num; + info->info.description = "(unknown shader)"; + + // Generate pretty description + for (int i = 0; i < info->steps.num; i++) { + const char *step = info->steps.elem[i]; + + // Prevent duplicates. We're okay using a weak equality check here + // because most pass descriptions are static strings. + for (int j = 0; j < i; j++) { + if (info->steps.elem[j] == step) + goto next_step; + } + + int count = 1; + for (int j = i+1; j < info->steps.num; j++) { + if (info->steps.elem[j] == step) + count++; + } + + const char *prefix = i > 0 ? ", " : ""; + if (count > 1) { + pl_str_append_asprintf(info, &info->desc, "%s%s x%d", + prefix, step, count); + } else { + pl_str_append_asprintf(info, &info->desc, "%s%s", prefix, step); + } + +next_step: ; + } + + if (info->desc.len) + info->info.description = (char *) info->desc.buf; + + sh->mutable = false; + return sh->buffers[SH_BUF_PRELUDE]; +} + +const struct pl_shader_res *pl_shader_finalize(pl_shader sh) +{ + if (sh->failed) { + return NULL; + } else if (!sh->mutable) { + return &sh->result; + } + + pl_shader_info info = &sh->info->info; + pl_str_builder glsl = sh_finalize_internal(sh); + + // Turn ident_t into friendly strings before passing it to users +#define FIX_IDENT(name) \ + name = sh_ident_tostr(sh_ident_unpack(name)) + for (int i = 0; i < sh->vas.num; i++) + FIX_IDENT(sh->vas.elem[i].attr.name); + for (int i = 0; i < sh->vars.num; i++) + FIX_IDENT(sh->vars.elem[i].var.name); + for (int i = 0; i < sh->consts.num; i++) + FIX_IDENT(sh->consts.elem[i].name); + for (int i = 0; i < sh->descs.num; i++) { + struct pl_shader_desc *sd = &sh->descs.elem[i]; + FIX_IDENT(sd->desc.name); + 
for (int j = 0; j < sd->num_buffer_vars; j++) // visit every buffer var of this descriptor + FIX_IDENT(sd->buffer_vars[j].var.name); + } +#undef FIX_IDENT + + sh->result = (struct pl_shader_res) { + .info = info, + .glsl = (char *) pl_str_builder_exec(glsl).buf, + .name = sh_ident_tostr(sh->name), + .input = sh->input, + .output = sh->output, + .compute_group_size = { sh->group_size[0], sh->group_size[1] }, + .compute_shmem = sh->shmem, + .vertex_attribs = sh->vas.elem, + .num_vertex_attribs = sh->vas.num, + .variables = sh->vars.elem, + .num_variables = sh->vars.num, + .descriptors = sh->descs.elem, + .num_descriptors = sh->descs.num, + .constants = sh->consts.elem, + .num_constants = sh->consts.num, + // deprecated fields + .params = info->params, + .steps = info->steps, + .num_steps = info->num_steps, + .description = info->description, + }; + + return &sh->result; +} + +bool sh_require(pl_shader sh, enum pl_shader_sig insig, int w, int h) +{ + if (sh->failed) { + SH_FAIL(sh, "Attempting to modify a failed shader!"); + return false; + } + + if (!sh->mutable) { + SH_FAIL(sh, "Attempted to modify an immutable shader!"); + return false; + } + + if ((w && sh->output_w && sh->output_w != w) || + (h && sh->output_h && sh->output_h != h)) + { + SH_FAIL(sh, "Illegal sequence of shader operations: Incompatible " + "output size requirements %dx%d and %dx%d", + sh->output_w, sh->output_h, w, h); + return false; + } + + static const char *names[] = { + [PL_SHADER_SIG_NONE] = "PL_SHADER_SIG_NONE", + [PL_SHADER_SIG_COLOR] = "PL_SHADER_SIG_COLOR", + }; + + // If we require an input, but there is none available - just get it from + // the user by turning it into an explicit input signature. + if (!sh->output && insig) { + pl_assert(!sh->input); + sh->input = insig; + } else if (sh->output != insig) { + SH_FAIL(sh, "Illegal sequence of shader operations! 
Current output " + "signature is '%s', but called operation expects '%s'!", + names[sh->output], names[insig]); + return false; + } + + // All of our shaders end up returning a vec4 color + sh->output = PL_SHADER_SIG_COLOR; + sh->output_w = PL_DEF(sh->output_w, w); + sh->output_h = PL_DEF(sh->output_h, h); + return true; +} + +static void sh_obj_deref(pl_shader_obj obj) +{ + if (!pl_rc_deref(&obj->rc)) + return; + + if (obj->uninit) + obj->uninit(obj->gpu, obj->priv); + + pl_free(obj); +} + +void pl_shader_obj_destroy(pl_shader_obj *ptr) +{ + pl_shader_obj obj = *ptr; + if (!obj) + return; + + sh_obj_deref(obj); + *ptr = NULL; +} + +void *sh_require_obj(pl_shader sh, pl_shader_obj *ptr, + enum pl_shader_obj_type type, size_t priv_size, + void (*uninit)(pl_gpu gpu, void *priv)) +{ + if (!ptr) + return NULL; + + pl_shader_obj obj = *ptr; + if (obj && obj->gpu != SH_GPU(sh)) { + SH_FAIL(sh, "Passed pl_shader_obj belongs to different GPU!"); + return NULL; + } + + if (obj && obj->type != type) { + SH_FAIL(sh, "Passed pl_shader_obj of wrong type! 
Shader objects must " + "always be used with the same type of shader."); + return NULL; + } + + if (!obj) { + obj = pl_zalloc_ptr(NULL, obj); + pl_rc_init(&obj->rc); + obj->gpu = SH_GPU(sh); + obj->type = type; + obj->priv = pl_zalloc(obj, priv_size); + obj->uninit = uninit; + } + + PL_ARRAY_APPEND(sh, sh->obj, obj); + pl_rc_ref(&obj->rc); + + *ptr = obj; + return obj->priv; +} + +ident_t sh_prng(pl_shader sh, bool temporal, ident_t *p_state) +{ + ident_t randfun = sh_fresh(sh, "rand"), + state = sh_fresh(sh, "state"); + + // Based on pcg3d (http://jcgt.org/published/0009/03/02/) + GLSLP("#define prng_t uvec3\n"); + GLSLH("vec3 "$"(inout uvec3 s) { \n" + " s = 1664525u * s + uvec3(1013904223u); \n" + " s.x += s.y * s.z; \n" + " s.y += s.z * s.x; \n" + " s.z += s.x * s.y; \n" + " s ^= s >> 16u; \n" + " s.x += s.y * s.z; \n" + " s.y += s.z * s.x; \n" + " s.z += s.x * s.y; \n" + " return vec3(s) * 1.0/float(0xFFFFFFFFu); \n" + "} \n", + randfun); + + if (temporal) { + GLSL("uvec3 "$" = uvec3(gl_FragCoord.xy, "$"); \n", + state, SH_UINT_DYN(SH_PARAMS(sh).index)); + } else { + GLSL("uvec3 "$" = uvec3(gl_FragCoord.xy, 0.0); \n", state); + } + + if (p_state) + *p_state = state; + + ident_t res = sh_fresh(sh, "RAND"); + GLSLH("#define "$" ("$"("$"))\n", res, randfun, state); + return res; +} diff --git a/src/shaders.h b/src/shaders.h new file mode 100644 index 0000000..7656a35 --- /dev/null +++ b/src/shaders.h @@ -0,0 +1,387 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include <stdio.h> +#include <limits.h> + +#include "common.h" +#include "cache.h" +#include "log.h" +#include "gpu.h" + +#include <libplacebo/shaders.h> + +// This represents an identifier (e.g. name of function, uniform etc.) for +// a shader resource. Not human-readable. + +typedef unsigned short ident_t; +#define $ "_%hx" +#define NULL_IDENT 0u + +#define sh_mkident(id, name) ((ident_t) id) +#define sh_ident_tostr(id) pl_asprintf(sh->tmp, $, id) + +enum { + IDENT_BITS = 8 * sizeof(ident_t), + IDENT_MASK = (uintptr_t) USHRT_MAX, + IDENT_SENTINEL = (uintptr_t) 0x20230319 << IDENT_BITS, +}; + +// Functions to pack/unpack an identifier into a `const char *` name field. +// Used to defer string templating of friendly names until actually necessary +static inline const char *sh_ident_pack(ident_t id) +{ + return (const char *)(uintptr_t) (IDENT_SENTINEL | id); +} + +static inline ident_t sh_ident_unpack(const char *name) +{ + uintptr_t uname = (uintptr_t) name; + assert((uname & ~IDENT_MASK) == IDENT_SENTINEL); + return uname & IDENT_MASK; +} + +enum pl_shader_buf { + SH_BUF_PRELUDE, // extra #defines etc. + SH_BUF_HEADER, // previous passes, helper function definitions, etc. 
+ SH_BUF_BODY, // partial contents of the "current" function + SH_BUF_FOOTER, // will be appended to the end of the current function + SH_BUF_COUNT, +}; + +enum pl_shader_type { + SH_AUTO, + SH_COMPUTE, + SH_FRAGMENT +}; + +struct sh_info { + // public-facing struct + struct pl_shader_info_t info; + + // internal fields + void *tmp; + pl_rc_t rc; + pl_str desc; + PL_ARRAY(const char *) steps; +}; + +struct pl_shader_t { + pl_log log; + void *tmp; // temporary allocations (freed on pl_shader_reset) + struct sh_info *info; + pl_str data; // pooled/recycled scratch buffer for small allocations + PL_ARRAY(pl_shader_obj) obj; + bool failed; + bool mutable; + ident_t name; + enum pl_shader_sig input, output; + int output_w; + int output_h; + bool transpose; + pl_str_builder buffers[SH_BUF_COUNT]; + enum pl_shader_type type; + bool flexible_work_groups; + int group_size[2]; + size_t shmem; + enum pl_sampler_type sampler_type; + char sampler_prefix; + unsigned short prefix; // pre-processed version of res.params.id + unsigned short fresh; + + // Note: internally, these `pl_shader_va` etc. use raw ident_t fields + // instead of `const char *` wherever a name is required! These are + // translated to legal strings either in `pl_shader_finalize`, or inside + // the `pl_dispatch` shader compilation step. + PL_ARRAY(struct pl_shader_va) vas; + PL_ARRAY(struct pl_shader_var) vars; + PL_ARRAY(struct pl_shader_desc) descs; + PL_ARRAY(struct pl_shader_const) consts; + + // cached result of `pl_shader_finalize` + struct pl_shader_res result; +}; + +// Free temporary resources associated with a shader. Normally called by +// pl_shader_reset(), but used internally to reduce memory waste. +void sh_deref(pl_shader sh); + +// Same as `pl_shader_finalize` but doesn't generate `sh->res`, instead returns +// the string builder to be used to finalize the shader. Assumes the caller +// will access the shader's internal fields directly. 
+pl_str_builder sh_finalize_internal(pl_shader sh); + +// Helper functions for convenience +#define SH_PARAMS(sh) ((sh)->info->info.params) +#define SH_GPU(sh) (SH_PARAMS(sh).gpu) +#define SH_CACHE(sh) pl_gpu_cache(SH_GPU(sh)) + +// Returns the GLSL version, defaulting to desktop 130. +struct pl_glsl_version sh_glsl(const pl_shader sh); + +#define SH_FAIL(sh, ...) do { \ + sh->failed = true; \ + PL_ERR(sh, __VA_ARGS__); \ + } while (0) + +// Attempt enabling compute shaders for this pass, if possible +bool sh_try_compute(pl_shader sh, int bw, int bh, bool flex, size_t mem); + +// Attempt merging a secondary shader into the current shader. Returns NULL if +// merging fails (e.g. incompatible signatures); otherwise returns an identifier +// corresponding to the generated subpass function. +// +// If successful, the subpass shader is set to an undefined failure state and +// must be explicitly reset/aborted before being re-used. +ident_t sh_subpass(pl_shader sh, pl_shader sub); + +// Helpers for adding new variables/descriptors/etc. with fresh, unique +// identifier names. These will never conflict with other identifiers, even +// if the shaders are merged together. +ident_t sh_fresh(pl_shader sh, const char *name); + +// Add a new shader var and return its identifier +ident_t sh_var(pl_shader sh, struct pl_shader_var sv); + +// Helper functions for `sh_var` +ident_t sh_var_int(pl_shader sh, const char *name, int val, bool dynamic); +ident_t sh_var_uint(pl_shader sh, const char *name, unsigned int val, bool dynamic); +ident_t sh_var_float(pl_shader sh, const char *name, float val, bool dynamic); +ident_t sh_var_mat3(pl_shader sh, const char *name, pl_matrix3x3 val); +#define SH_INT_DYN(val) sh_var_int(sh, "const", val, true) +#define SH_UINT_DYN(val) sh_var_uint(sh, "const", val, true) +#define SH_FLOAT_DYN(val) sh_var_float(sh, "const", val, true) +#define SH_MAT3(val) sh_var_mat3(sh, "mat", val) + +// Add a new shader desc and return its identifier. 
+ident_t sh_desc(pl_shader sh, struct pl_shader_desc sd);
+
+// Add a new shader constant and return its identifier.
+ident_t sh_const(pl_shader sh, struct pl_shader_const sc);
+
+// Helper functions for `sh_const`
+ident_t sh_const_int(pl_shader sh, const char *name, int val);
+ident_t sh_const_uint(pl_shader sh, const char *name, unsigned int val);
+ident_t sh_const_float(pl_shader sh, const char *name, float val);
+
+// Shorthands for the helpers above. These expect a variable named `sh` to be
+// in scope at the point of use.
+#define SH_INT(val) sh_const_int(sh, "const", val)
+#define SH_UINT(val) sh_const_uint(sh, "const", val)
+#define SH_FLOAT(val) sh_const_float(sh, "const", val)
+
+// Add a new shader va and return its identifier
+ident_t sh_attr(pl_shader sh, struct pl_shader_va sva);
+
+// Helper to add a vec2 VA from a pl_rect2df. Returns NULL_IDENT on failure.
+ident_t sh_attr_vec2(pl_shader sh, const char *name, const pl_rect2df *rc);
+
+// Bind a texture under a given transformation and make its attributes
+// available as well. If an output pointer for one of the attributes is left
+// as NULL, that attribute will not be added. Returns NULL_IDENT on failure.
+// `rect` is optional, and defaults to the full texture if left as NULL.
+//
+// Note that for e.g. compute shaders, the vec2 out_pos might be a macro that
+// expands to an expensive computation, and should be cached by the user.
+ident_t sh_bind(pl_shader sh, pl_tex tex,
+                enum pl_tex_address_mode address_mode,
+                enum pl_tex_sample_mode sample_mode,
+                const char *name, const pl_rect2df *rect,
+                ident_t *out_pos, ident_t *out_pt);
+
+// Incrementally build up a buffer by adding new variable elements to the
+// buffer, resizing buf.buffer_vars if necessary. Returns whether or not the
+// variable could be successfully added (which may fail if you try exceeding
+// the size limits of the buffer type). If successful, the layout is stored
+// in *out_layout (may be NULL).
+bool sh_buf_desc_append(void *alloc, pl_gpu gpu, + struct pl_shader_desc *buf_desc, + struct pl_var_layout *out_layout, + const struct pl_var new_var); + +size_t sh_buf_desc_size(const struct pl_shader_desc *buf_desc); + + +// Underlying function for appending text to a shader +#define sh_append(sh, buf, ...) \ + pl_str_builder_addf((sh)->buffers[buf], __VA_ARGS__) + +#define sh_append_str(sh, buf, str) \ + pl_str_builder_str((sh)->buffers[buf], str) + +#define GLSLP(...) sh_append(sh, SH_BUF_PRELUDE, __VA_ARGS__) +#define GLSLH(...) sh_append(sh, SH_BUF_HEADER, __VA_ARGS__) +#define GLSL(...) sh_append(sh, SH_BUF_BODY, __VA_ARGS__) +#define GLSLF(...) sh_append(sh, SH_BUF_FOOTER, __VA_ARGS__) + +// Attach a description to a shader +void sh_describef(pl_shader sh, const char *fmt, ...) + PL_PRINTF(2, 3); + +static inline void sh_describe(pl_shader sh, const char *desc) +{ + PL_ARRAY_APPEND(sh->info, sh->info->steps, desc); +}; + +// Requires that the share is mutable, has an output signature compatible +// with the given input signature, as well as an output size compatible with +// the given size requirements. Errors and returns false otherwise. +bool sh_require(pl_shader sh, enum pl_shader_sig insig, int w, int h); + +// Shader resources + +enum pl_shader_obj_type { + PL_SHADER_OBJ_INVALID = 0, + PL_SHADER_OBJ_COLOR_MAP, + PL_SHADER_OBJ_SAMPLER, + PL_SHADER_OBJ_DITHER, + PL_SHADER_OBJ_LUT, + PL_SHADER_OBJ_AV1_GRAIN, + PL_SHADER_OBJ_FILM_GRAIN, + PL_SHADER_OBJ_RESHAPE, +}; + +struct pl_shader_obj_t { + enum pl_shader_obj_type type; + pl_rc_t rc; + pl_gpu gpu; + void (*uninit)(pl_gpu gpu, void *priv); + void *priv; +}; + +// Returns (*ptr)->priv, or NULL on failure +void *sh_require_obj(pl_shader sh, pl_shader_obj *ptr, + enum pl_shader_obj_type type, size_t priv_size, + void (*uninit)(pl_gpu gpu, void *priv)); + +#define SH_OBJ(sh, ptr, type, t, uninit) \ + ((t*) sh_require_obj(sh, ptr, type, sizeof(t), uninit)) + +// Initializes a PRNG. 
+// The resulting string will directly evaluate to a
+// pseudorandom, uniformly distributed vec3 from [0.0,1.0]. Since this
+// algorithm works by mutating a state variable, if the user wants to use the
+// resulting PRNG inside a subfunction, they must add an extra `inout prng_t %s`
+// with the contents of `state` to the signature. (Optional)
+//
+// If `temporal` is set, the PRNG will vary across frames.
+ident_t sh_prng(pl_shader sh, bool temporal, ident_t *state);
+
+// Backing memory type
+enum sh_lut_type {
+    SH_LUT_AUTO = 0, // pick whatever makes the most sense
+    SH_LUT_TEXTURE,  // upload as texture
+    SH_LUT_UNIFORM,  // uniform array
+    SH_LUT_LITERAL,  // constant / literal array in shader source (fallback)
+};
+
+// Interpolation method
+enum sh_lut_method {
+    SH_LUT_NONE = 0,    // no interpolation, integer indices
+    SH_LUT_LINEAR,      // linear interpolation, vecN indices in range [0,1]
+    SH_LUT_CUBIC,       // (bi/tri)cubic interpolation
+    SH_LUT_TETRAHEDRAL, // tetrahedral interpolation for vec3, equivalent to
+                        // SH_LUT_LINEAR for lower dimensions
+};
+
+// Parameters for `sh_lut`. Usually constructed via the `sh_lut_params(...)`
+// macro, which also fills in `debug_tag`.
+struct sh_lut_params {
+    pl_shader_obj *object;
+
+    // Type of the LUT we intend to generate.
+    //
+    // Note: If `var_type` is PL_VAR_*INT, `method` must be SH_LUT_NONE.
+    enum pl_var_type var_type;
+    enum sh_lut_type lut_type;
+    enum sh_lut_method method;
+
+    // For SH_LUT_TEXTURE, this can be used to override the texture's internal
+    // format, in which case it takes precedence over the default for
+    // `var_type`.
+    pl_fmt fmt;
+
+    // LUT dimensions. Unused dimensions may be left as 0.
+    int width;
+    int height;
+    int depth;
+    int comps;
+
+    // If true, the LUT will always be regenerated, even if the dimensions have
+    // not changed.
+    bool update;
+
+    // Alternate way of triggering shader invalidations. If the signature
+    // does not match the LUT's signature, it will be regenerated.
+    uint64_t signature;
+
+    // If set to true, shader objects will be preserved and updated in-place
+    // rather than being treated as read-only.
+    bool dynamic;
+
+    // If set, generated shader objects are automatically cached in this
+    // cache. Requires `signature` to be set (and uniquely identify the LUT).
+    pl_cache cache;
+
+    // Will be called with a zero-initialized buffer whenever the data needs to
+    // be computed, which happens whenever the size is changed, the shader
+    // object is invalidated, or `update` is set to true.
+    //
+    // Note: Interpretation of `data` is according to `var_type` and `fmt`.
+    void (*fill)(void *data, const struct sh_lut_params *params);
+    void *priv;
+
+    // Debug tag to track LUT source
+    pl_debug_tag debug_tag;
+};
+
+// Builds a pointer to a temporary `struct sh_lut_params` from designated
+// initializers, with `debug_tag` pre-set to the call site's PL_DEBUG_TAG.
+#define sh_lut_params(...) (&(struct sh_lut_params) {   \
+        .debug_tag = PL_DEBUG_TAG,                      \
+        __VA_ARGS__                                     \
+    })
+
+// Makes a table of values available as a shader variable, using a given
+// method (falling back if needed). The resulting identifier can be sampled
+// directly as %s(pos), where pos is a vector with the right number of
+// dimensions. `pos` must be an integer vector within the bounds of the array,
+// unless the method is `SH_LUT_LINEAR`, in which case it's a float vector that
+// gets interpolated and clamped as needed. Returns NULL_IDENT on error.
+ident_t sh_lut(pl_shader sh, const struct sh_lut_params *params); + +static inline uint8_t sh_num_comps(uint8_t mask) +{ + pl_assert((mask & 0xF) == mask); + return __builtin_popcount(mask); +} + +static inline const char *sh_float_type(uint8_t mask) +{ + switch (sh_num_comps(mask)) { + case 1: return "float"; + case 2: return "vec2"; + case 3: return "vec3"; + case 4: return "vec4"; + } + + pl_unreachable(); +} + +static inline const char *sh_swizzle(uint8_t mask) +{ + static const char * const swizzles[0x10] = { + NULL, "r", "g", "rg", "b", "rb", "gb", "rgb", + "a", "ra", "ga", "rga", "ba", "rba", "gba", "rgba", + }; + + pl_assert(mask <= PL_ARRAY_SIZE(swizzles)); + return swizzles[mask]; +} diff --git a/src/shaders/colorspace.c b/src/shaders/colorspace.c new file mode 100644 index 0000000..c7b3b5a --- /dev/null +++ b/src/shaders/colorspace.c @@ -0,0 +1,2120 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */
+
+#include <math.h>
+
+#include "cache.h"
+#include "shaders.h"
+
+#include <libplacebo/shaders/colorspace.h>
+
+// Common constants for SMPTE ST.2084 (PQ)
+static const float PQ_M1 = 2610./4096 * 1./4,
+                   PQ_M2 = 2523./4096 * 128,
+                   PQ_C1 = 3424./4096,
+                   PQ_C2 = 2413./4096 * 32,
+                   PQ_C3 = 2392./4096 * 32;
+
+// Common constants for ARIB STD-B67 (HLG)
+static const float HLG_A = 0.17883277,
+                   HLG_B = 0.28466892,
+                   HLG_C = 0.55991073,
+                   HLG_REF = 1000.0 / PL_COLOR_SDR_WHITE;
+
+// Common constants for Panasonic V-Log
+static const float VLOG_B = 0.00873,
+                   VLOG_C = 0.241514,
+                   VLOG_D = 0.598206;
+
+// Common constants for Sony S-Log
+static const float SLOG_A = 0.432699,
+                   SLOG_B = 0.037584,
+                   SLOG_C = 0.616596 + 0.03,
+                   SLOG_P = 3.538813,
+                   SLOG_Q = 0.030001,
+                   SLOG_K2 = 155.0 / 219.0;
+
+// Emits GLSL converting `color` between premultiplied and independent alpha
+// so that it matches `mode`, and updates `repr->alpha` accordingly. Does
+// nothing if `repr->alpha` is neither of those two modes or already matches.
+void pl_shader_set_alpha(pl_shader sh, struct pl_color_repr *repr,
+                         enum pl_alpha_mode mode)
+{
+    if (repr->alpha == PL_ALPHA_PREMULTIPLIED && mode == PL_ALPHA_INDEPENDENT) {
+        // Guard against dividing by (near-)zero alpha
+        GLSL("if (color.a > 1e-6) \n"
+             " color.rgb /= vec3(color.a); \n");
+        repr->alpha = PL_ALPHA_INDEPENDENT;
+    }
+
+    if (repr->alpha == PL_ALPHA_INDEPENDENT && mode == PL_ALPHA_PREMULTIPLIED) {
+        GLSL("color.rgb *= vec3(color.a); \n");
+        repr->alpha = PL_ALPHA_PREMULTIPLIED;
+    }
+}
+
+#ifdef PL_HAVE_DOVI
+// Emits the GLSL for an MMR reshaping step. Reads the GLSL variables `sig`
+// and `coeffs` and writes the result to `s`.
+//
+// mmr:       identifier of the packed weight array (two vec4s per order)
+// single:    if true, the weights start at index 0; otherwise the start
+//            index is read from `coeffs.y` at runtime
+// min_order: lowest order used by any piece of this component
+// max_order: highest order used by any piece of this component
+//
+// When `min_order < max_order`, the order is read from `coeffs.w` at runtime
+// and the higher-order terms are wrapped in `if (order >= N)` blocks.
+static inline void reshape_mmr(pl_shader sh, ident_t mmr, bool single,
+                               int min_order, int max_order)
+{
+    if (single) {
+        GLSL("const uint mmr_idx = 0u; \n");
+    } else {
+        GLSL("uint mmr_idx = uint(coeffs.y); \n");
+    }
+
+    assert(min_order <= max_order);
+    if (min_order < max_order)
+        GLSL("uint order = uint(coeffs.w); \n");
+
+    // First order terms (always present)
+    GLSL("vec4 sigX; \n"
+         "s = coeffs.x; \n"
+         "sigX.xyz = sig.xxy * sig.yzz; \n"
+         "sigX.w = sigX.x * sig.z; \n"
+         "s += dot("$"[mmr_idx + 0].xyz, sig); \n"
+         "s += dot("$"[mmr_idx + 1], sigX); \n",
+         mmr, mmr);
+
+    if (max_order >= 2) {
+        if (min_order < 2)
+            GLSL("if (order >= 2) { \n");
+
+        GLSL("vec3 sig2 = sig * sig; \n"
+             "vec4 sigX2 = sigX * sigX; \n"
+             "s += dot("$"[mmr_idx + 2].xyz, sig2); \n"
+             "s += dot("$"[mmr_idx + 3], sigX2); \n",
+             mmr, mmr);
+
+        if (max_order == 3) {
+            // The condition must be a complete parenthesized expression,
+            // otherwise the generated GLSL fails to compile
+            if (min_order < 3)
+                GLSL("if (order >= 3) { \n");
+
+            GLSL("s += dot("$"[mmr_idx + 4].xyz, sig2 * sig); \n"
+                 "s += dot("$"[mmr_idx + 5], sigX2 * sigX); \n",
+                 mmr, mmr);
+
+            if (min_order < 3)
+                GLSL("} \n");
+        }
+
+        if (min_order < 2)
+            GLSL("} \n");
+    }
+}
+
+// Emits the GLSL for a second-order polynomial reshaping step, evaluating
+// `coeffs.x + coeffs.y * s + coeffs.z * s^2` in Horner form.
+static inline void reshape_poly(pl_shader sh)
+{
+    GLSL("s = (coeffs.z * s + coeffs.y) * s + coeffs.x; \n");
+}
+#endif
+
+void pl_shader_dovi_reshape(pl_shader sh, const struct pl_dovi_metadata *data)
+{
+#ifdef PL_HAVE_DOVI
+    if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0) || !data)
+        return;
+
+    sh_describe(sh, "reshaping");
+    GLSL("// pl_shader_reshape \n"
+         "{ \n"
+         "vec3 sig; \n"
+         "vec4 coeffs; \n"
+         "float s; \n"
+         "sig = clamp(color.rgb, 0.0, 1.0); \n");
+
+    float coeffs_data[8][4];
+    float mmr_packed_data[8*6][4];
+
+    for (int c = 0; c < 3; c++) {
+        const struct pl_reshape_data *comp = &data->comp[c];
+        if (!comp->num_pivots)
+            continue;
+
+        pl_assert(comp->num_pivots >= 2 && comp->num_pivots <= 9);
+        GLSL("s = sig[%d]; \n", c);
+
+        // Prepare coefficients for GPU
+        bool has_poly = false, has_mmr = false, mmr_single = true;
+        int mmr_idx = 0, min_order = 3, max_order = 1;
+        memset(coeffs_data, 0, sizeof(coeffs_data));
+        for (int i = 0; i < comp->num_pivots - 1; i++) {
+            switch (comp->method[i]) {
+            case 0: // polynomial
+                has_poly = true;
+                coeffs_data[i][3] = 0.0; // order=0 signals polynomial
+                for (int k = 0; k < 3; k++)
+                    coeffs_data[i][k] = comp->poly_coeffs[i][k];
+                break;
+
+            case 1:
+                min_order = PL_MIN(min_order, comp->mmr_order[i]);
+                max_order = PL_MAX(max_order, comp->mmr_order[i]);
+                mmr_single = !has_mmr;
+                has_mmr = true;
+                coeffs_data[i][3] = (float) comp->mmr_order[i];
+                coeffs_data[i][0] = comp->mmr_constant[i];
+                coeffs_data[i][1] = (float) mmr_idx;
+                for (int j = 0; j < comp->mmr_order[i]; j++) {
+                    // store weights per order as two packed vec4s
+                    float *mmr =
&mmr_packed_data[mmr_idx][0];
+                    // NOTE(review): indices 4..7 land in the following row
+                    // of `mmr_packed_data`; this relies on the rows being
+                    // contiguous in memory
+                    mmr[0] = comp->mmr_coeffs[i][j][0];
+                    mmr[1] = comp->mmr_coeffs[i][j][1];
+                    mmr[2] = comp->mmr_coeffs[i][j][2];
+                    mmr[3] = 0.0; // unused
+                    mmr[4] = comp->mmr_coeffs[i][j][3];
+                    mmr[5] = comp->mmr_coeffs[i][j][4];
+                    mmr[6] = comp->mmr_coeffs[i][j][5];
+                    mmr[7] = comp->mmr_coeffs[i][j][6];
+                    mmr_idx += 2;
+                }
+                break;
+
+            default:
+                pl_unreachable();
+            }
+        }
+
+        if (comp->num_pivots > 2) {
+
+            // Skip the (irrelevant) lower and upper bounds
+            float pivots_data[7];
+            memcpy(pivots_data, comp->pivots + 1,
+                   (comp->num_pivots - 2) * sizeof(pivots_data[0]));
+
+            // Fill the remainder with a quasi-infinite sentinel pivot
+            for (int i = comp->num_pivots - 2; i < PL_ARRAY_SIZE(pivots_data); i++)
+                pivots_data[i] = 1e9f;
+
+            ident_t pivots = sh_var(sh, (struct pl_shader_var) {
+                .data = pivots_data,
+                .var = {
+                    .name = "pivots",
+                    .type = PL_VAR_FLOAT,
+                    .dim_v = 1,
+                    .dim_m = 1,
+                    .dim_a = PL_ARRAY_SIZE(pivots_data),
+                },
+            });
+
+            ident_t coeffs = sh_var(sh, (struct pl_shader_var) {
+                .data = coeffs_data,
+                .var = {
+                    .name = "coeffs",
+                    .type = PL_VAR_FLOAT,
+                    .dim_v = 4,
+                    .dim_m = 1,
+                    .dim_a = PL_ARRAY_SIZE(coeffs_data),
+                },
+            });
+
+            // Efficiently branch into the correct set of coefficients
+            // (a three-level binary selection over the 7 interior pivots)
+            GLSL("#define test(i) bvec4(s >= "$"[i]) \n"
+                 "#define coef(i) "$"[i] \n"
+                 "coeffs = mix(mix(mix(coef(0), coef(1), test(0)), \n"
+                 " mix(coef(2), coef(3), test(2)), \n"
+                 " test(1)), \n"
+                 " mix(mix(coef(4), coef(5), test(4)), \n"
+                 " mix(coef(6), coef(7), test(6)), \n"
+                 " test(5)), \n"
+                 " test(3)); \n"
+                 "#undef test \n"
+                 "#undef coef \n",
+                 pivots, coeffs);
+
+        } else {
+
+            // No need for a single pivot, just set the coeffs directly
+            GLSL("coeffs = "$"; \n", sh_var(sh, (struct pl_shader_var) {
+                .var = pl_var_vec4("coeffs"),
+                .data = coeffs_data,
+            }));
+
+        }
+
+        // Upload the packed MMR weights only if some piece uses MMR
+        ident_t mmr = NULL_IDENT;
+        if (has_mmr) {
+            mmr = sh_var(sh, (struct pl_shader_var) {
+                .data = mmr_packed_data,
+                .var = {
+                    .name = "mmr",
+                    .type = PL_VAR_FLOAT,
+                    .dim_v = 4,
+                    .dim_m = 1,
+                    .dim_a = mmr_idx,
+                },
+            });
+        }
+
+        // `coeffs.w` holds the MMR order, with 0 signalling a polynomial
+        if (has_mmr && has_poly) {
+            GLSL("if (coeffs.w == 0.0) { \n");
+            reshape_poly(sh);
+            GLSL("} else { \n");
+            reshape_mmr(sh, mmr, mmr_single, min_order, max_order);
+            GLSL("} \n");
+        } else if (has_poly) {
+            reshape_poly(sh);
+        } else {
+            assert(has_mmr);
+            GLSL("{ \n");
+            reshape_mmr(sh, mmr, mmr_single, min_order, max_order);
+            GLSL("} \n");
+        }
+
+        // Clamp the result to the first and last pivot of this component
+        ident_t lo = sh_var(sh, (struct pl_shader_var) {
+            .var = pl_var_float("lo"),
+            .data = &comp->pivots[0],
+        });
+        ident_t hi = sh_var(sh, (struct pl_shader_var) {
+            .var = pl_var_float("hi"),
+            .data = &comp->pivots[comp->num_pivots - 1],
+        });
+        GLSL("color[%d] = clamp(s, "$", "$"); \n", c, lo, hi);
+    }
+
+    GLSL("} \n");
+#else
+    SH_FAIL(sh, "libplacebo was compiled without support for dolbyvision reshaping");
+#endif
+}
+
+// Emits GLSL that decodes `color` from the representation described by
+// `repr`. `repr` is passed on to `pl_shader_set_alpha` and
+// `pl_color_repr_decode`, which may modify it. `params` may be NULL.
+void pl_shader_decode_color(pl_shader sh, struct pl_color_repr *repr,
+                            const struct pl_color_adjustment *params)
+{
+    if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+        return;
+
+    sh_describe(sh, "color decoding");
+    GLSL("// pl_shader_decode_color \n"
+         "{ \n");
+
+    // Do this first because the following operations are potentially nonlinear
+    pl_shader_set_alpha(sh, repr, PL_ALPHA_INDEPENDENT);
+
+    if (repr->sys == PL_COLOR_SYSTEM_XYZ ||
+        repr->sys == PL_COLOR_SYSTEM_DOLBYVISION)
+    {
+        ident_t scale = SH_FLOAT(pl_color_repr_normalize(repr));
+        GLSL("color.rgb *= vec3("$"); \n", scale);
+    }
+
+    if (repr->sys == PL_COLOR_SYSTEM_XYZ) {
+        pl_shader_linearize(sh, &(struct pl_color_space) {
+            .transfer = PL_COLOR_TRC_ST428,
+        });
+    }
+
+    if (repr->sys == PL_COLOR_SYSTEM_DOLBYVISION)
+        pl_shader_dovi_reshape(sh, repr->dovi);
+
+    // Remember the system before decoding; the switch below dispatches on
+    // this saved value rather than on `repr->sys`
+    enum pl_color_system orig_sys = repr->sys;
+    pl_transform3x3 tr = pl_color_repr_decode(repr, params);
+
+    // Skip the matrix multiplication entirely for an identity transform
+    if (memcmp(&tr, &pl_transform3x3_identity, sizeof(tr))) {
+        ident_t cmat = sh_var(sh, (struct pl_shader_var) {
+            .var = pl_var_mat3("cmat"),
+            .data = PL_TRANSPOSE_3X3(tr.mat.m),
+        });
+
+        ident_t cmat_c = sh_var(sh, (struct pl_shader_var) {
+            .var = pl_var_vec3("cmat_c"),
+            .data = tr.c,
+        });
+
+        GLSL("color.rgb = "$" * color.rgb + "$"; \n", cmat, cmat_c);
+    }
+
+    switch (orig_sys) {
+    case PL_COLOR_SYSTEM_BT_2020_C:
+        // Conversion for C'rcY'cC'bc via the BT.2020 CL system:
+        // C'bc = (B'-Y'c) / 1.9404  | C'bc <= 0
+        //      = (B'-Y'c) / 1.5816  | C'bc >  0
+        //
+        // C'rc = (R'-Y'c) / 1.7184  | C'rc <= 0
+        //      = (R'-Y'c) / 0.9936  | C'rc >  0
+        //
+        // as per the BT.2020 specification, table 4. This is a non-linear
+        // transformation because (constant) luminance receives non-equal
+        // contributions from the three different channels.
+        GLSL("// constant luminance conversion \n"
+             "color.br = color.br * mix(vec2(1.5816, 0.9936), \n"
+             " vec2(1.9404, 1.7184), \n"
+             " lessThanEqual(color.br, vec2(0.0))) \n"
+             " + color.gg; \n");
+        // Expand channels to camera-linear light. This shader currently just
+        // assumes everything uses the BT.2020 12-bit gamma function, since the
+        // difference between 10 and 12-bit is negligible for anything other
+        // than 12-bit content.
+        GLSL("vec3 lin = mix(color.rgb * vec3(1.0/4.5), \n"
+             " pow((color.rgb + vec3(0.0993))*vec3(1.0/1.0993), \n"
+             " vec3(1.0/0.45)), \n"
+             " lessThanEqual(vec3(0.08145), color.rgb)); \n");
+        // Calculate the green channel from the expanded RYcB, and recompress to G'
+        // The BT.2020 specification says Yc = 0.2627*R + 0.6780*G + 0.0593*B
+        GLSL("color.g = (lin.g - 0.2627*lin.r - 0.0593*lin.b)*1.0/0.6780; \n"
+             "color.g = mix(color.g * 4.5, \n"
+             " 1.0993 * pow(color.g, 0.45) - 0.0993, \n"
+             " 0.0181 <= color.g); \n");
+        break;
+
+    case PL_COLOR_SYSTEM_BT_2100_PQ:;
+        // Conversion process from the spec:
+        //
+        // 1. L'M'S' = cmat * ICtCp
+        // 2. LMS = linearize(L'M'S') (EOTF for PQ, inverse OETF for HLG)
+        // 3. RGB = lms2rgb * LMS
+        //
+        // After this we need to invert step 2 to arrive at non-linear RGB.
+        // (It's important we keep the transfer function conversion separate
+        // from the color system decoding, so we have to partially undo our
+        // work here even though we will end up linearizing later on anyway)
+
+        GLSL(// PQ EOTF
+             "color.rgb = pow(max(color.rgb, 0.0), vec3(1.0/%f)); \n"
+             "color.rgb = max(color.rgb - vec3(%f), 0.0) \n"
+             " / (vec3(%f) - vec3(%f) * color.rgb); \n"
+             "color.rgb = pow(color.rgb, vec3(1.0/%f)); \n"
+             // LMS matrix
+             "color.rgb = mat3( 3.43661, -0.79133, -0.0259499, \n"
+             " -2.50645, 1.98360, -0.0989137, \n"
+             " 0.06984, -0.192271, 1.12486) * color.rgb; \n"
+             // PQ OETF
+             "color.rgb = pow(max(color.rgb, 0.0), vec3(%f)); \n"
+             "color.rgb = (vec3(%f) + vec3(%f) * color.rgb) \n"
+             " / (vec3(1.0) + vec3(%f) * color.rgb); \n"
+             "color.rgb = pow(color.rgb, vec3(%f)); \n",
+             PQ_M2, PQ_C1, PQ_C2, PQ_C3, PQ_M1,
+             PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2);
+        break;
+
+    case PL_COLOR_SYSTEM_BT_2100_HLG:
+        GLSL(// HLG OETF^-1
+             "color.rgb = mix(vec3(4.0) * color.rgb * color.rgb, \n"
+             " exp((color.rgb - vec3(%f)) * vec3(1.0/%f)) \n"
+             " + vec3(%f), \n"
+             " lessThan(vec3(0.5), color.rgb)); \n"
+             // LMS matrix
+             "color.rgb = mat3( 3.43661, -0.79133, -0.0259499, \n"
+             " -2.50645, 1.98360, -0.0989137, \n"
+             " 0.06984, -0.192271, 1.12486) * color.rgb; \n"
+             // HLG OETF
+             "color.rgb = mix(vec3(0.5) * sqrt(color.rgb), \n"
+             " vec3(%f) * log(color.rgb - vec3(%f)) + vec3(%f), \n"
+             " lessThan(vec3(1.0), color.rgb)); \n",
+             HLG_C, HLG_A, HLG_B,
+             HLG_A, HLG_B, HLG_C);
+        break;
+
+    case PL_COLOR_SYSTEM_DOLBYVISION:;
+#ifdef PL_HAVE_DOVI
+        // Dolby Vision always outputs BT.2020-referred HPE LMS, so hard-code
+        // the inverse LMS->RGB matrix corresponding to this color space.
+        pl_matrix3x3 dovi_lms2rgb = {{
+            { 3.06441879, -2.16597676,  0.10155818},
+            {-0.65612108,  1.78554118, -0.12943749},
+            { 0.01736321, -0.04725154,  1.03004253},
+        }};
+
+        pl_matrix3x3_mul(&dovi_lms2rgb, &repr->dovi->linear);
+        ident_t mat = sh_var(sh, (struct pl_shader_var) {
+            .var = pl_var_mat3("lms2rgb"),
+            .data = PL_TRANSPOSE_3X3(dovi_lms2rgb.m),
+        });
+
+        // PQ EOTF
+        GLSL("color.rgb = pow(max(color.rgb, 0.0), vec3(1.0/%f)); \n"
+             "color.rgb = max(color.rgb - vec3(%f), 0.0) \n"
+             " / (vec3(%f) - vec3(%f) * color.rgb); \n"
+             "color.rgb = pow(color.rgb, vec3(1.0/%f)); \n",
+             PQ_M2, PQ_C1, PQ_C2, PQ_C3, PQ_M1);
+        // LMS matrix
+        GLSL("color.rgb = "$" * color.rgb; \n", mat);
+        // PQ OETF
+        GLSL("color.rgb = pow(max(color.rgb, 0.0), vec3(%f)); \n"
+             "color.rgb = (vec3(%f) + vec3(%f) * color.rgb) \n"
+             " / (vec3(1.0) + vec3(%f) * color.rgb); \n"
+             "color.rgb = pow(color.rgb, vec3(%f)); \n",
+             PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2);
+        break;
+#else
+        SH_FAIL(sh, "libplacebo was compiled without support for dolbyvision reshaping");
+        return;
+#endif
+
+    case PL_COLOR_SYSTEM_UNKNOWN:
+    case PL_COLOR_SYSTEM_RGB:
+    case PL_COLOR_SYSTEM_XYZ:
+    case PL_COLOR_SYSTEM_BT_601:
+    case PL_COLOR_SYSTEM_BT_709:
+    case PL_COLOR_SYSTEM_SMPTE_240M:
+    case PL_COLOR_SYSTEM_BT_2020_NC:
+    case PL_COLOR_SYSTEM_YCGCO:
+        break; // no special post-processing needed
+
+    case PL_COLOR_SYSTEM_COUNT:
+        pl_unreachable();
+    }
+
+    // Gamma adjustment. Doing this here (in non-linear light) is technically
+    // somewhat wrong, but this is just an aesthetic parameter and not really
+    // meant for colorimetric precision, so we don't care too much.
+ if (params && params->gamma == 0) { + // Avoid division by zero + GLSL("color.rgb = vec3(0.0); \n"); + } else if (params && params->gamma != 1) { + ident_t gamma = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_float("gamma"), + .data = &(float){ 1 / params->gamma }, + }); + GLSL("color.rgb = pow(max(color.rgb, vec3(0.0)), vec3("$")); \n", gamma); + } + + GLSL("}\n"); +} + +void pl_shader_encode_color(pl_shader sh, const struct pl_color_repr *repr) +{ + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return; + + sh_describe(sh, "color encoding"); + GLSL("// pl_shader_encode_color \n" + "{ \n"); + + switch (repr->sys) { + case PL_COLOR_SYSTEM_BT_2020_C: + // Expand R'G'B' to RGB + GLSL("vec3 lin = mix(color.rgb * vec3(1.0/4.5), \n" + " pow((color.rgb + vec3(0.0993))*vec3(1.0/1.0993), \n" + " vec3(1.0/0.45)), \n" + " lessThanEqual(vec3(0.08145), color.rgb)); \n"); + + // Compute Yc from RGB and compress to R'Y'cB' + GLSL("color.g = dot(vec3(0.2627, 0.6780, 0.0593), lin); \n" + "color.g = mix(color.g * 4.5, \n" + " 1.0993 * pow(color.g, 0.45) - 0.0993, \n" + " 0.0181 <= color.g); \n"); + + // Compute C'bc and C'rc into color.br + GLSL("color.br = color.br - color.gg; \n" + "color.br *= mix(vec2(1.0/1.5816, 1.0/0.9936), \n" + " vec2(1.0/1.9404, 1.0/1.7184), \n" + " lessThanEqual(color.br, vec2(0.0))); \n"); + break; + + case PL_COLOR_SYSTEM_BT_2100_PQ:; + GLSL("color.rgb = pow(max(color.rgb, 0.0), vec3(1.0/%f)); \n" + "color.rgb = max(color.rgb - vec3(%f), 0.0) \n" + " / (vec3(%f) - vec3(%f) * color.rgb); \n" + "color.rgb = pow(color.rgb, vec3(1.0/%f)); \n" + "color.rgb = mat3(0.412109, 0.166748, 0.024170, \n" + " 0.523925, 0.720459, 0.075440, \n" + " 0.063965, 0.112793, 0.900394) * color.rgb; \n" + "color.rgb = pow(color.rgb, vec3(%f)); \n" + "color.rgb = (vec3(%f) + vec3(%f) * color.rgb) \n" + " / (vec3(1.0) + vec3(%f) * color.rgb); \n" + "color.rgb = pow(color.rgb, vec3(%f)); \n", + PQ_M2, PQ_C1, PQ_C2, PQ_C3, PQ_M1, + PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2); + 
break; + + case PL_COLOR_SYSTEM_BT_2100_HLG: + GLSL("color.rgb = mix(vec3(4.0) * color.rgb * color.rgb, \n" + " exp((color.rgb - vec3(%f)) * vec3(1.0/%f)) \n" + " + vec3(%f), \n" + " lessThan(vec3(0.5), color.rgb)); \n" + "color.rgb = mat3(0.412109, 0.166748, 0.024170, \n" + " 0.523925, 0.720459, 0.075440, \n" + " 0.063965, 0.112793, 0.900394) * color.rgb; \n" + "color.rgb = mix(vec3(0.5) * sqrt(color.rgb), \n" + " vec3(%f) * log(color.rgb - vec3(%f)) + vec3(%f), \n" + " lessThan(vec3(1.0), color.rgb)); \n", + HLG_C, HLG_A, HLG_B, + HLG_A, HLG_B, HLG_C); + break; + + case PL_COLOR_SYSTEM_DOLBYVISION: + SH_FAIL(sh, "Cannot un-apply dolbyvision yet (no inverse reshaping)!"); + return; + + case PL_COLOR_SYSTEM_UNKNOWN: + case PL_COLOR_SYSTEM_RGB: + case PL_COLOR_SYSTEM_XYZ: + case PL_COLOR_SYSTEM_BT_601: + case PL_COLOR_SYSTEM_BT_709: + case PL_COLOR_SYSTEM_SMPTE_240M: + case PL_COLOR_SYSTEM_BT_2020_NC: + case PL_COLOR_SYSTEM_YCGCO: + break; // no special pre-processing needed + + case PL_COLOR_SYSTEM_COUNT: + pl_unreachable(); + } + + // Since this is a relatively rare operation, bypass it as much as possible + bool skip = true; + skip &= PL_DEF(repr->sys, PL_COLOR_SYSTEM_RGB) == PL_COLOR_SYSTEM_RGB; + skip &= PL_DEF(repr->levels, PL_COLOR_LEVELS_FULL) == PL_COLOR_LEVELS_FULL; + skip &= !repr->bits.sample_depth || !repr->bits.color_depth || + repr->bits.sample_depth == repr->bits.color_depth; + skip &= !repr->bits.bit_shift; + + if (!skip) { + struct pl_color_repr copy = *repr; + ident_t xyzscale = NULL_IDENT; + if (repr->sys == PL_COLOR_SYSTEM_XYZ) + xyzscale = SH_FLOAT(1.0 / pl_color_repr_normalize(©)); + + pl_transform3x3 tr = pl_color_repr_decode(©, NULL); + pl_transform3x3_invert(&tr); + + ident_t cmat = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_mat3("cmat"), + .data = PL_TRANSPOSE_3X3(tr.mat.m), + }); + + ident_t cmat_c = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_vec3("cmat_c"), + .data = tr.c, + }); + + GLSL("color.rgb = "$" * color.rgb + 
"$"; \n", cmat, cmat_c); + + if (repr->sys == PL_COLOR_SYSTEM_XYZ) { + pl_shader_delinearize(sh, &(struct pl_color_space) { + .transfer = PL_COLOR_TRC_ST428, + }); + GLSL("color.rgb *= vec3("$"); \n", xyzscale); + } + } + + if (repr->alpha == PL_ALPHA_PREMULTIPLIED) + GLSL("color.rgb *= vec3(color.a); \n"); + + GLSL("}\n"); +} + +static ident_t sh_luma_coeffs(pl_shader sh, const struct pl_color_space *csp) +{ + pl_matrix3x3 rgb2xyz; + rgb2xyz = pl_get_rgb2xyz_matrix(pl_raw_primaries_get(csp->primaries)); + + // FIXME: Cannot use `const vec3` due to glslang bug #2025 + ident_t coeffs = sh_fresh(sh, "luma_coeffs"); + GLSLH("#define "$" vec3("$", "$", "$") \n", coeffs, + SH_FLOAT(rgb2xyz.m[1][0]), // RGB->Y vector + SH_FLOAT(rgb2xyz.m[1][1]), + SH_FLOAT(rgb2xyz.m[1][2])); + return coeffs; +} + +void pl_shader_linearize(pl_shader sh, const struct pl_color_space *csp) +{ + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return; + + if (csp->transfer == PL_COLOR_TRC_LINEAR) + return; + + float csp_min, csp_max; + pl_color_space_nominal_luma_ex(pl_nominal_luma_params( + .color = csp, + .metadata = PL_HDR_METADATA_HDR10, + .scaling = PL_HDR_NORM, + .out_min = &csp_min, + .out_max = &csp_max, + )); + + // Note that this clamp may technically violate the definition of + // ITU-R BT.2100, which allows for sub-blacks and super-whites to be + // displayed on the display where such would be possible. That said, the + // problem is that not all gamma curves are well-defined on the values + // outside this range, so we ignore it and just clamp anyway for sanity. 
+    GLSL("// pl_shader_linearize \n"
+         "color.rgb = max(color.rgb, 0.0); \n");
+
+    // Cases that `goto scale_out` additionally rescale the result from [0,1]
+    // to [csp_min, csp_max]; cases that `return` do not
+    switch (csp->transfer) {
+    case PL_COLOR_TRC_SRGB:
+        GLSL("color.rgb = mix(color.rgb * vec3(1.0/12.92), \n"
+             " pow((color.rgb + vec3(0.055))/vec3(1.055), \n"
+             " vec3(2.4)), \n"
+             " lessThan(vec3(0.04045), color.rgb)); \n");
+        goto scale_out;
+    case PL_COLOR_TRC_BT_1886: {
+        const float lb = powf(csp_min, 1/2.4f);
+        const float lw = powf(csp_max, 1/2.4f);
+        const float a = powf(lw - lb, 2.4f);
+        const float b = lb / (lw - lb);
+        GLSL("color.rgb = "$" * pow(color.rgb + vec3("$"), vec3(2.4)); \n",
+             SH_FLOAT(a), SH_FLOAT(b));
+        return;
+    }
+    case PL_COLOR_TRC_GAMMA18:
+        GLSL("color.rgb = pow(color.rgb, vec3(1.8));\n");
+        goto scale_out;
+    case PL_COLOR_TRC_GAMMA20:
+        GLSL("color.rgb = pow(color.rgb, vec3(2.0));\n");
+        goto scale_out;
+    case PL_COLOR_TRC_UNKNOWN:
+    case PL_COLOR_TRC_GAMMA22:
+        GLSL("color.rgb = pow(color.rgb, vec3(2.2));\n");
+        goto scale_out;
+    case PL_COLOR_TRC_GAMMA24:
+        GLSL("color.rgb = pow(color.rgb, vec3(2.4));\n");
+        goto scale_out;
+    case PL_COLOR_TRC_GAMMA26:
+        GLSL("color.rgb = pow(color.rgb, vec3(2.6));\n");
+        goto scale_out;
+    case PL_COLOR_TRC_GAMMA28:
+        GLSL("color.rgb = pow(color.rgb, vec3(2.8));\n");
+        goto scale_out;
+    case PL_COLOR_TRC_PRO_PHOTO:
+        GLSL("color.rgb = mix(color.rgb * vec3(1.0/16.0), \n"
+             " pow(color.rgb, vec3(1.8)), \n"
+             " lessThan(vec3(0.03125), color.rgb)); \n");
+        goto scale_out;
+    case PL_COLOR_TRC_ST428:
+        GLSL("color.rgb = vec3(52.37/48.0) * pow(color.rgb, vec3(2.6));\n");
+        goto scale_out;
+    case PL_COLOR_TRC_PQ:
+        GLSL("color.rgb = pow(color.rgb, vec3(1.0/%f)); \n"
+             "color.rgb = max(color.rgb - vec3(%f), 0.0) \n"
+             " / (vec3(%f) - vec3(%f) * color.rgb); \n"
+             "color.rgb = pow(color.rgb, vec3(1.0/%f)); \n"
+             // PQ's output range is 0-10000, but we need it to be relative
+             // to PL_COLOR_SDR_WHITE instead, so rescale
+             "color.rgb *= vec3(%f); \n",
+             PQ_M2, PQ_C1, PQ_C2, PQ_C3, PQ_M1, 10000.0 / PL_COLOR_SDR_WHITE);
+        return;
+    case PL_COLOR_TRC_HLG: {
+        const float y = fmaxf(1.2f + 0.42f * log10f(csp_max / HLG_REF), 1);
+        const float b = sqrtf(3 * powf(csp_min / csp_max, 1 / y));
+        // OETF^-1
+        GLSL("color.rgb = "$" * color.rgb + vec3("$"); \n"
+             "color.rgb = mix(vec3(4.0) * color.rgb * color.rgb, \n"
+             " exp((color.rgb - vec3(%f)) * vec3(1.0/%f))\n"
+             " + vec3(%f), \n"
+             " lessThan(vec3(0.5), color.rgb)); \n",
+             SH_FLOAT(1 - b), SH_FLOAT(b),
+             HLG_C, HLG_A, HLG_B);
+        // OOTF
+        GLSL("color.rgb *= 1.0 / 12.0; \n"
+             "color.rgb *= "$" * pow(max(dot("$", color.rgb), 0.0), "$"); \n",
+             SH_FLOAT(csp_max), sh_luma_coeffs(sh, csp), SH_FLOAT(y - 1));
+        return;
+    }
+    case PL_COLOR_TRC_V_LOG:
+        GLSL("color.rgb = mix((color.rgb - vec3(0.125)) * vec3(1.0/5.6), \n"
+             " pow(vec3(10.0), (color.rgb - vec3(%f)) * vec3(1.0/%f)) \n"
+             " - vec3(%f), \n"
+             " lessThanEqual(vec3(0.181), color.rgb)); \n",
+             VLOG_D, VLOG_C, VLOG_B);
+        return;
+    case PL_COLOR_TRC_S_LOG1:
+        GLSL("color.rgb = pow(vec3(10.0), (color.rgb - vec3(%f)) * vec3(1.0/%f)) \n"
+             " - vec3(%f); \n",
+             SLOG_C, SLOG_A, SLOG_B);
+        return;
+    case PL_COLOR_TRC_S_LOG2:
+        GLSL("color.rgb = mix((color.rgb - vec3(%f)) * vec3(1.0/%f), \n"
+             " (pow(vec3(10.0), (color.rgb - vec3(%f)) * vec3(1.0/%f)) \n"
+             " - vec3(%f)) * vec3(1.0/%f), \n"
+             " lessThanEqual(vec3(%f), color.rgb)); \n",
+             SLOG_Q, SLOG_P, SLOG_C, SLOG_A, SLOG_B, SLOG_K2, SLOG_Q);
+        return;
+    case PL_COLOR_TRC_LINEAR:
+    case PL_COLOR_TRC_COUNT:
+        break;
+    }
+
+    // LINEAR was handled by the early return above, and COUNT is not a real
+    // transfer function
+    pl_unreachable();
+
+scale_out:
+    if (csp_max != 1 || csp_min != 0) {
+        GLSL("color.rgb = "$" * color.rgb + vec3("$"); \n",
+             SH_FLOAT(csp_max - csp_min), SH_FLOAT(csp_min));
+    }
+}
+
+void pl_shader_delinearize(pl_shader sh, const struct pl_color_space *csp)
+{
+    if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+        return;
+
+    if (csp->transfer == PL_COLOR_TRC_LINEAR)
+        return;
+
+    float csp_min, csp_max;
+    pl_color_space_nominal_luma_ex(pl_nominal_luma_params(
+        .color = csp,
+        .metadata = PL_HDR_METADATA_HDR10,
+        .scaling = PL_HDR_NORM,
+ .out_min = &csp_min, + .out_max = &csp_max, + )); + + GLSL("// pl_shader_delinearize \n"); + switch (csp->transfer) { + case PL_COLOR_TRC_UNKNOWN: + case PL_COLOR_TRC_SRGB: + case PL_COLOR_TRC_LINEAR: + case PL_COLOR_TRC_GAMMA18: + case PL_COLOR_TRC_GAMMA20: + case PL_COLOR_TRC_GAMMA22: + case PL_COLOR_TRC_GAMMA24: + case PL_COLOR_TRC_GAMMA26: + case PL_COLOR_TRC_GAMMA28: + case PL_COLOR_TRC_PRO_PHOTO: + case PL_COLOR_TRC_ST428: ; + if (csp_max != 1 || csp_min != 0) { + GLSL("color.rgb = "$" * color.rgb + vec3("$"); \n", + SH_FLOAT(1 / (csp_max - csp_min)), + SH_FLOAT(-csp_min / (csp_max - csp_min))); + } + break; + case PL_COLOR_TRC_BT_1886: + case PL_COLOR_TRC_PQ: + case PL_COLOR_TRC_HLG: + case PL_COLOR_TRC_V_LOG: + case PL_COLOR_TRC_S_LOG1: + case PL_COLOR_TRC_S_LOG2: + break; // scene-referred or absolute scale + case PL_COLOR_TRC_COUNT: + pl_unreachable(); + } + + GLSL("color.rgb = max(color.rgb, 0.0); \n"); + + switch (csp->transfer) { + case PL_COLOR_TRC_SRGB: + GLSL("color.rgb = mix(color.rgb * vec3(12.92), \n" + " vec3(1.055) * pow(color.rgb, vec3(1.0/2.4)) \n" + " - vec3(0.055), \n" + " lessThanEqual(vec3(0.0031308), color.rgb)); \n"); + return; + case PL_COLOR_TRC_BT_1886: { + const float lb = powf(csp_min, 1/2.4f); + const float lw = powf(csp_max, 1/2.4f); + const float a = powf(lw - lb, 2.4f); + const float b = lb / (lw - lb); + GLSL("color.rgb = pow("$" * color.rgb, vec3(1.0/2.4)) - vec3("$"); \n", + SH_FLOAT(1.0 / a), SH_FLOAT(b)); + return; + } + case PL_COLOR_TRC_GAMMA18: + GLSL("color.rgb = pow(color.rgb, vec3(1.0/1.8));\n"); + return; + case PL_COLOR_TRC_GAMMA20: + GLSL("color.rgb = pow(color.rgb, vec3(1.0/2.0));\n"); + return; + case PL_COLOR_TRC_UNKNOWN: + case PL_COLOR_TRC_GAMMA22: + GLSL("color.rgb = pow(color.rgb, vec3(1.0/2.2));\n"); + return; + case PL_COLOR_TRC_GAMMA24: + GLSL("color.rgb = pow(color.rgb, vec3(1.0/2.4));\n"); + return; + case PL_COLOR_TRC_GAMMA26: + GLSL("color.rgb = pow(color.rgb, vec3(1.0/2.6));\n"); + return; + case 
PL_COLOR_TRC_GAMMA28: + GLSL("color.rgb = pow(color.rgb, vec3(1.0/2.8));\n"); + return; + case PL_COLOR_TRC_ST428: + GLSL("color.rgb = pow(color.rgb * vec3(48.0/52.37), vec3(1.0/2.6));\n"); + return; + case PL_COLOR_TRC_PRO_PHOTO: + GLSL("color.rgb = mix(color.rgb * vec3(16.0), \n" + " pow(color.rgb, vec3(1.0/1.8)), \n" + " lessThanEqual(vec3(0.001953), color.rgb)); \n"); + return; + case PL_COLOR_TRC_PQ: + GLSL("color.rgb *= vec3(1.0/%f); \n" + "color.rgb = pow(color.rgb, vec3(%f)); \n" + "color.rgb = (vec3(%f) + vec3(%f) * color.rgb) \n" + " / (vec3(1.0) + vec3(%f) * color.rgb); \n" + "color.rgb = pow(color.rgb, vec3(%f)); \n", + 10000 / PL_COLOR_SDR_WHITE, PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2); + return; + case PL_COLOR_TRC_HLG: { + const float y = fmaxf(1.2f + 0.42f * log10f(csp_max / HLG_REF), 1); + const float b = sqrtf(3 * powf(csp_min / csp_max, 1 / y)); + // OOTF^-1 + GLSL("color.rgb *= 1.0 / "$"; \n" + "color.rgb *= 12.0 * max(1e-6, pow(dot("$", color.rgb), "$")); \n", + SH_FLOAT(csp_max), sh_luma_coeffs(sh, csp), SH_FLOAT((1 - y) / y)); + // OETF + GLSL("color.rgb = mix(vec3(0.5) * sqrt(color.rgb), \n" + " vec3(%f) * log(color.rgb - vec3(%f)) + vec3(%f), \n" + " lessThan(vec3(1.0), color.rgb)); \n" + "color.rgb = "$" * color.rgb + vec3("$"); \n", + HLG_A, HLG_B, HLG_C, + SH_FLOAT(1 / (1 - b)), SH_FLOAT(-b / (1 - b))); + return; + } + case PL_COLOR_TRC_V_LOG: + GLSL("color.rgb = mix(vec3(5.6) * color.rgb + vec3(0.125), \n" + " vec3(%f) * log(color.rgb + vec3(%f)) \n" + " + vec3(%f), \n" + " lessThanEqual(vec3(0.01), color.rgb)); \n", + VLOG_C / M_LN10, VLOG_B, VLOG_D); + return; + case PL_COLOR_TRC_S_LOG1: + GLSL("color.rgb = vec3(%f) * log(color.rgb + vec3(%f)) + vec3(%f);\n", + SLOG_A / M_LN10, SLOG_B, SLOG_C); + return; + case PL_COLOR_TRC_S_LOG2: + GLSL("color.rgb = mix(vec3(%f) * color.rgb + vec3(%f), \n" + " vec3(%f) * log(vec3(%f) * color.rgb + vec3(%f)) \n" + " + vec3(%f), \n" + " lessThanEqual(vec3(0.0), color.rgb)); \n", + SLOG_P, SLOG_Q, SLOG_A / 
M_LN10, SLOG_K2, SLOG_B, SLOG_C); + return; + case PL_COLOR_TRC_LINEAR: + case PL_COLOR_TRC_COUNT: + break; + } + + pl_unreachable(); +} + +const struct pl_sigmoid_params pl_sigmoid_default_params = { PL_SIGMOID_DEFAULTS }; + +void pl_shader_sigmoidize(pl_shader sh, const struct pl_sigmoid_params *params) +{ + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return; + + params = PL_DEF(params, &pl_sigmoid_default_params); + float center = PL_DEF(params->center, pl_sigmoid_default_params.center); + float slope = PL_DEF(params->slope, pl_sigmoid_default_params.slope); + + // This function needs to go through (0,0) and (1,1), so we compute the + // values at 1 and 0, and then scale/shift them, respectively. + float offset = 1.0 / (1 + expf(slope * center)); + float scale = 1.0 / (1 + expf(slope * (center - 1))) - offset; + + GLSL("// pl_shader_sigmoidize \n" + "color = clamp(color, 0.0, 1.0); \n" + "color = vec4("$") - vec4("$") * \n" + " log(vec4(1.0) / (color * vec4("$") + vec4("$")) \n" + " - vec4(1.0)); \n", + SH_FLOAT(center), SH_FLOAT(1.0 / slope), + SH_FLOAT(scale), SH_FLOAT(offset)); +} + +void pl_shader_unsigmoidize(pl_shader sh, const struct pl_sigmoid_params *params) +{ + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return; + + // See: pl_shader_sigmoidize + params = PL_DEF(params, &pl_sigmoid_default_params); + float center = PL_DEF(params->center, pl_sigmoid_default_params.center); + float slope = PL_DEF(params->slope, pl_sigmoid_default_params.slope); + float offset = 1.0 / (1 + expf(slope * center)); + float scale = 1.0 / (1 + expf(slope * (center - 1))) - offset; + + GLSL("// pl_shader_unsigmoidize \n" + "color = clamp(color, 0.0, 1.0); \n" + "color = vec4("$") / \n" + " (vec4(1.0) + exp(vec4("$") * (vec4("$") - color))) \n" + " - vec4("$"); \n", + SH_FLOAT(1.0 / scale), + SH_FLOAT(slope), SH_FLOAT(center), + SH_FLOAT(offset / scale)); +} + +const struct pl_peak_detect_params pl_peak_detect_default_params = { PL_PEAK_DETECT_DEFAULTS }; +const 
struct pl_peak_detect_params pl_peak_detect_high_quality_params = { PL_PEAK_DETECT_HQ_DEFAULTS }; + +static bool peak_detect_params_eq(const struct pl_peak_detect_params *a, + const struct pl_peak_detect_params *b) +{ + return a->smoothing_period == b->smoothing_period && + a->scene_threshold_low == b->scene_threshold_low && + a->scene_threshold_high == b->scene_threshold_high && + a->percentile == b->percentile; + // don't compare `allow_delayed` because it doesn't change measurement +} + +enum { + // Split the peak buffer into several independent slices to reduce pressure + // on global atomics + SLICES = 12, + + // How many bits to use for storing PQ data. Be careful when setting this + // too high, as it may overflow `unsigned int` on large video sources. + // + // The value chosen is enough to guarantee no overflow for an 8K x 4K frame + // consisting entirely of 100% 10k nits PQ values, with 16x16 workgroups. + PQ_BITS = 14, + PQ_MAX = (1 << PQ_BITS) - 1, + + // How many bits to use for the histogram. We bias the histogram down + // by half the PQ range (~90 nits), effectively clumping the SDR part + // of the image into a single histogram bin. 
+ HIST_BITS = 7, + HIST_BIAS = 1 << (HIST_BITS - 1), + HIST_BINS = (1 << HIST_BITS) - HIST_BIAS, + + // Convert from histogram bin to (starting) PQ value +#define HIST_PQ(bin) (((bin) + HIST_BIAS) << (PQ_BITS - HIST_BITS)) +}; + + +pl_static_assert(PQ_BITS >= HIST_BITS); + +struct peak_buf_data { + unsigned frame_wg_count[SLICES]; // number of work groups processed + unsigned frame_wg_active[SLICES];// number of active (nonzero) work groups + unsigned frame_sum_pq[SLICES]; // sum of PQ Y values over all WGs (PQ_BITS) + unsigned frame_max_pq[SLICES]; // maximum PQ Y value among these WGs (PQ_BITS) + unsigned frame_hist[SLICES][HIST_BINS]; // always allocated, conditionally used +}; + +static const struct pl_buffer_var peak_buf_vars[] = { +#define VAR(field) { \ + .var = { \ + .name = #field, \ + .type = PL_VAR_UINT, \ + .dim_v = 1, \ + .dim_m = 1, \ + .dim_a = sizeof(((struct peak_buf_data *) NULL)->field) / \ + sizeof(unsigned), \ + }, \ + .layout = { \ + .offset = offsetof(struct peak_buf_data, field), \ + .size = sizeof(((struct peak_buf_data *) NULL)->field), \ + .stride = sizeof(unsigned), \ + }, \ +} + VAR(frame_wg_count), + VAR(frame_wg_active), + VAR(frame_sum_pq), + VAR(frame_max_pq), + VAR(frame_hist), +#undef VAR +}; + +struct sh_color_map_obj { + // Tone map state + struct { + struct pl_tone_map_params params; + pl_shader_obj lut; + } tone; + + // Gamut map state + struct { + pl_shader_obj lut; + } gamut; + + // Peak detection state + struct { + struct pl_peak_detect_params params; // currently active parameters + pl_buf buf; // pending peak detection buffer + pl_buf readback; // readback buffer (fallback) + float avg_pq; // current (smoothed) values + float max_pq; + } peak; +}; + +// Excluding size, since this is checked by sh_lut +static uint64_t gamut_map_signature(const struct pl_gamut_map_params *par) +{ + uint64_t sig = CACHE_KEY_GAMUT_LUT; + pl_hash_merge(&sig, pl_str0_hash(par->function->name)); + pl_hash_merge(&sig, 
pl_var_hash(par->input_gamut)); + pl_hash_merge(&sig, pl_var_hash(par->output_gamut)); + pl_hash_merge(&sig, pl_var_hash(par->min_luma)); + pl_hash_merge(&sig, pl_var_hash(par->max_luma)); + pl_hash_merge(&sig, pl_var_hash(par->constants)); + return sig; +} + +static void sh_color_map_uninit(pl_gpu gpu, void *ptr) +{ + struct sh_color_map_obj *obj = ptr; + pl_shader_obj_destroy(&obj->tone.lut); + pl_shader_obj_destroy(&obj->gamut.lut); + pl_buf_destroy(gpu, &obj->peak.buf); + pl_buf_destroy(gpu, &obj->peak.readback); + memset(obj, 0, sizeof(*obj)); +} + +static inline float iir_coeff(float rate) +{ + if (!rate) + return 1.0f; + return 1.0f - expf(-1.0f / rate); +} + +static float measure_peak(const struct peak_buf_data *data, float percentile) +{ + unsigned frame_max_pq = data->frame_max_pq[0]; + for (int k = 1; k < SLICES; k++) + frame_max_pq = PL_MAX(frame_max_pq, data->frame_max_pq[k]); + const float frame_max = (float) frame_max_pq / PQ_MAX; + if (percentile <= 0 || percentile >= 100) + return frame_max; + unsigned total_pixels = 0; + for (int k = 0; k < SLICES; k++) { + for (int i = 0; i < HIST_BINS; i++) + total_pixels += data->frame_hist[k][i]; + } + if (!total_pixels) // no histogram data available? 
+ return frame_max; + + const unsigned target_pixel = ceilf(percentile / 100.0f * total_pixels); + if (target_pixel >= total_pixels) + return frame_max; + + unsigned sum = 0; + for (int i = 0; i < HIST_BINS; i++) { + unsigned next = sum; + for (int k = 0; k < SLICES; k++) + next += data->frame_hist[k][i]; + if (next < target_pixel) { + sum = next; + continue; + } + + // Upper and lower frequency boundaries of the matching histogram bin + const unsigned count_low = sum; // last pixel of previous bin + const unsigned count_high = next + 1; // first pixel of next bin + pl_assert(count_low < target_pixel && target_pixel < count_high); + + // PQ luminance associated with count_low/high respectively + const float pq_low = (float) HIST_PQ(i) / PQ_MAX; + float pq_high = (float) HIST_PQ(i + 1) / PQ_MAX; + if (count_high > total_pixels) // special case for last histogram bin + pq_high = frame_max; + + // Position of `target_pixel` inside this bin, assumes pixels are + // equidistributed inside a histogram bin + const float ratio = (float) (target_pixel - count_low) / + (count_high - count_low); + return PL_MIX(pq_low, pq_high, ratio); + } + + pl_unreachable(); +} + +// if `force` is true, ensures the buffer is read, even if `allow_delayed` +static void update_peak_buf(pl_gpu gpu, struct sh_color_map_obj *obj, bool force) +{ + const struct pl_peak_detect_params *params = &obj->peak.params; + if (!obj->peak.buf) + return; + + if (!force && params->allow_delayed && pl_buf_poll(gpu, obj->peak.buf, 0)) + return; // buffer not ready yet + + bool ok; + struct peak_buf_data data = {0}; + if (obj->peak.readback) { + pl_buf_copy(gpu, obj->peak.readback, 0, obj->peak.buf, 0, sizeof(data)); + ok = pl_buf_read(gpu, obj->peak.readback, 0, &data, sizeof(data)); + } else { + ok = pl_buf_read(gpu, obj->peak.buf, 0, &data, sizeof(data)); + } + if (ok && data.frame_wg_count[0] > 0) { + // Peak detection completed successfully + pl_buf_destroy(gpu, &obj->peak.buf); + } else { + // No data read? 
Possibly this peak obj has not been executed yet + if (!ok) { + PL_ERR(gpu, "Failed reading peak detection buffer!"); + } else if (params->allow_delayed) { + PL_TRACE(gpu, "Peak detection buffer not yet ready, ignoring.."); + } else { + PL_WARN(gpu, "Peak detection usage error: attempted detecting peak " + "and using detected peak in the same shader program, " + "but `params->allow_delayed` is false! Ignoring, but " + "expect incorrect output."); + } + if (force || !ok) + pl_buf_destroy(gpu, &obj->peak.buf); + return; + } + + uint64_t frame_sum_pq = 0u, frame_wg_count = 0u, frame_wg_active = 0u; + for (int k = 0; k < SLICES; k++) { + frame_sum_pq += data.frame_sum_pq[k]; + frame_wg_count += data.frame_wg_count[k]; + frame_wg_active += data.frame_wg_active[k]; + } + float avg_pq, max_pq; + if (frame_wg_active) { + avg_pq = (float) frame_sum_pq / (frame_wg_active * PQ_MAX); + max_pq = measure_peak(&data, params->percentile); + } else { + // Solid black frame + avg_pq = max_pq = PL_COLOR_HDR_BLACK; + } + + if (!obj->peak.avg_pq) { + // Set the initial value accordingly if it contains no data + obj->peak.avg_pq = avg_pq; + obj->peak.max_pq = max_pq; + } else { + // Ignore small deviations from existing peak (rounding error) + static const float epsilon = 1.0f / PQ_MAX; + if (fabsf(avg_pq - obj->peak.avg_pq) < epsilon) + avg_pq = obj->peak.avg_pq; + if (fabsf(max_pq - obj->peak.max_pq) < epsilon) + max_pq = obj->peak.max_pq; + } + + // Use an IIR low-pass filter to smooth out the detected values + const float coeff = iir_coeff(params->smoothing_period); + obj->peak.avg_pq += coeff * (avg_pq - obj->peak.avg_pq); + obj->peak.max_pq += coeff * (max_pq - obj->peak.max_pq); + + // Scene change hysteresis + if (params->scene_threshold_low > 0 && params->scene_threshold_high > 0) { + const float log10_pq = 1e-2f; // experimentally determined approximate + const float thresh_low = params->scene_threshold_low * log10_pq; + const float thresh_high = params->scene_threshold_high * 
log10_pq; + const float bias = (float) frame_wg_active / frame_wg_count; + const float delta = bias * fabsf(avg_pq - obj->peak.avg_pq); + const float mix_coeff = pl_smoothstep(thresh_low, thresh_high, delta); + obj->peak.avg_pq = PL_MIX(obj->peak.avg_pq, avg_pq, mix_coeff); + obj->peak.max_pq = PL_MIX(obj->peak.max_pq, max_pq, mix_coeff); + } +} + +bool pl_shader_detect_peak(pl_shader sh, struct pl_color_space csp, + pl_shader_obj *state, + const struct pl_peak_detect_params *params) +{ + params = PL_DEF(params, &pl_peak_detect_default_params); + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return false; + + pl_gpu gpu = SH_GPU(sh); + if (!gpu || gpu->limits.max_ssbo_size < sizeof(struct peak_buf_data)) { + PL_ERR(sh, "HDR peak detection requires a GPU with support for at " + "least %zu bytes of SSBO data (supported: %zu)", + sizeof(struct peak_buf_data), gpu ? gpu->limits.max_ssbo_size : 0); + return false; + } + + const bool use_histogram = params->percentile > 0 && params->percentile < 100; + size_t shmem_req = 3 * sizeof(uint32_t); + if (use_histogram) + shmem_req += sizeof(uint32_t[HIST_BINS]); + + if (!sh_try_compute(sh, 16, 16, true, shmem_req)) { + PL_ERR(sh, "HDR peak detection requires compute shaders with support " + "for at least %zu bytes of shared memory! 
(avail: %zu)", + shmem_req, sh_glsl(sh).max_shmem_size); + return false; + } + + struct sh_color_map_obj *obj; + obj = SH_OBJ(sh, state, PL_SHADER_OBJ_COLOR_MAP, struct sh_color_map_obj, + sh_color_map_uninit); + if (!obj) + return false; + + if (peak_detect_params_eq(&obj->peak.params, params)) { + update_peak_buf(gpu, obj, true); // prevent over-writing previous frame + } else { + pl_reset_detected_peak(*state); + } + + pl_assert(!obj->peak.buf); + static const struct peak_buf_data zero = {0}; + +retry_ssbo: + if (obj->peak.readback) { + obj->peak.buf = pl_buf_create(gpu, pl_buf_params( + .size = sizeof(struct peak_buf_data), + .storable = true, + .initial_data = &zero, + )); + } else { + obj->peak.buf = pl_buf_create(gpu, pl_buf_params( + .size = sizeof(struct peak_buf_data), + .memory_type = PL_BUF_MEM_DEVICE, + .host_readable = true, + .storable = true, + .initial_data = &zero, + )); + } + + if (!obj->peak.buf && !obj->peak.readback) { + PL_WARN(sh, "Failed creating host-readable peak detection SSBO, " + "retrying with fallback buffer"); + obj->peak.readback = pl_buf_create(gpu, pl_buf_params( + .size = sizeof(struct peak_buf_data), + .host_readable = true, + )); + if (obj->peak.readback) + goto retry_ssbo; + } + + if (!obj->peak.buf) { + SH_FAIL(sh, "Failed creating peak detection SSBO!"); + return false; + } + + obj->peak.params = *params; + + sh_desc(sh, (struct pl_shader_desc) { + .desc = { + .name = "PeakBuf", + .type = PL_DESC_BUF_STORAGE, + .access = PL_DESC_ACCESS_READWRITE, + }, + .binding.object = obj->peak.buf, + .buffer_vars = (struct pl_buffer_var *) peak_buf_vars, + .num_buffer_vars = PL_ARRAY_SIZE(peak_buf_vars), + }); + + sh_describe(sh, "peak detection"); + GLSL("// pl_shader_detect_peak \n" + "{ \n" + "const uint wg_size = gl_WorkGroupSize.x * gl_WorkGroupSize.y; \n" + "uint wg_idx = gl_WorkGroupID.y * gl_NumWorkGroups.x + \n" + " gl_WorkGroupID.x; \n" + "uint slice = wg_idx %% %du; \n" + "vec4 color_orig = color; \n", + SLICES); + + // For 
performance, we want to do as few atomic operations on global + // memory as possible, so use an atomic in shmem for the work group. + ident_t wg_sum = sh_fresh(sh, "wg_sum"), + wg_max = sh_fresh(sh, "wg_max"), + wg_black = sh_fresh(sh, "wg_black"), + wg_hist = NULL_IDENT; + GLSLH("shared uint "$", "$", "$"; \n", wg_sum, wg_max, wg_black); + if (use_histogram) { + wg_hist = sh_fresh(sh, "wg_hist"); + GLSLH("shared uint "$"[%u]; \n", wg_hist, HIST_BINS); + GLSL("for (uint i = gl_LocalInvocationIndex; i < %du; i += wg_size) \n" + " "$"[i] = 0u; \n", + HIST_BINS, wg_hist); + } + GLSL($" = 0u; "$" = 0u; "$" = 0u; \n" + "barrier(); \n", + wg_sum, wg_max, wg_black); + + // Decode color into linear light representation + pl_color_space_infer(&csp); + pl_shader_linearize(sh, &csp); + + // Measure luminance as N-bit PQ + GLSL("float luma = dot("$", color.rgb); \n" + "luma *= %f; \n" + "luma = pow(clamp(luma, 0.0, 1.0), %f); \n" + "luma = (%f + %f * luma) / (1.0 + %f * luma); \n" + "luma = pow(luma, %f); \n" + "luma *= smoothstep(0.0, 1e-2, luma); \n" + "uint y_pq = uint(%d.0 * luma); \n", + sh_luma_coeffs(sh, &csp), + PL_COLOR_SDR_WHITE / 10000.0, + PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2, + PQ_MAX); + + // Update the work group's shared atomics + bool has_subgroups = sh_glsl(sh).subgroup_size > 0; + if (use_histogram) { + GLSL("int bin = (int(y_pq) >> %d) - %d; \n" + "bin = clamp(bin, 0, %d); \n", + PQ_BITS - HIST_BITS, HIST_BIAS, + HIST_BINS - 1); + if (has_subgroups) { + // Optimize for the very common case of identical histogram bins + GLSL("if (subgroupAllEqual(bin)) { \n" + " if (subgroupElect()) \n" + " atomicAdd("$"[bin], gl_SubgroupSize); \n" + "} else { \n" + " atomicAdd("$"[bin], 1u); \n" + "} \n", + wg_hist, wg_hist); + } else { + GLSL("atomicAdd("$"[bin], 1u); \n", wg_hist); + } + } + + if (has_subgroups) { + GLSL("uint group_sum = subgroupAdd(y_pq); \n" + "uint group_max = subgroupMax(y_pq); \n" + "uvec4 b = subgroupBallot(y_pq == 0u); \n" + "if (subgroupElect()) { 
\n" + " atomicAdd("$", group_sum); \n" + " atomicMax("$", group_max); \n" + " atomicAdd("$", subgroupBallotBitCount(b));\n" + "} \n" + "barrier(); \n", + wg_sum, wg_max, wg_black); + } else { + GLSL("atomicAdd("$", y_pq); \n" + "atomicMax("$", y_pq); \n" + "if (y_pq == 0u) \n" + " atomicAdd("$", 1u); \n" + "barrier(); \n", + wg_sum, wg_max, wg_black); + } + + if (use_histogram) { + GLSL("if (gl_LocalInvocationIndex == 0u) \n" + " "$"[0] -= "$"; \n" + "for (uint i = gl_LocalInvocationIndex; i < %du; i += wg_size) \n" + " atomicAdd(frame_hist[slice * %du + i], "$"[i]); \n", + wg_hist, wg_black, + HIST_BINS, + HIST_BINS, wg_hist); + } + + // Have one thread per work group update the global atomics + GLSL("if (gl_LocalInvocationIndex == 0u) { \n" + " uint num = wg_size - "$"; \n" + " atomicAdd(frame_wg_count[slice], 1u); \n" + " atomicAdd(frame_wg_active[slice], min(num, 1u)); \n" + " if (num > 0u) { \n" + " atomicAdd(frame_sum_pq[slice], "$" / num); \n" + " atomicMax(frame_max_pq[slice], "$"); \n" + " } \n" + "} \n" + "color = color_orig; \n" + "} \n", + wg_black, wg_sum, wg_max); + + return true; +} + +bool pl_get_detected_hdr_metadata(const pl_shader_obj state, + struct pl_hdr_metadata *out) +{ + if (!state || state->type != PL_SHADER_OBJ_COLOR_MAP) + return false; + + struct sh_color_map_obj *obj = state->priv; + update_peak_buf(state->gpu, obj, false); + if (!obj->peak.avg_pq) + return false; + + out->max_pq_y = obj->peak.max_pq; + out->avg_pq_y = obj->peak.avg_pq; + return true; +} + +bool pl_get_detected_peak(const pl_shader_obj state, + float *out_peak, float *out_avg) +{ + struct pl_hdr_metadata data; + if (!pl_get_detected_hdr_metadata(state, &data)) + return false; + + // Preserves old behavior + *out_peak = pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, data.max_pq_y); + *out_avg = pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, data.avg_pq_y); + return true; +} + +void pl_reset_detected_peak(pl_shader_obj state) +{ + if (!state || state->type != PL_SHADER_OBJ_COLOR_MAP) + 
return; + + struct sh_color_map_obj *obj = state->priv; + pl_buf readback = obj->peak.readback; + pl_buf_destroy(state->gpu, &obj->peak.buf); + memset(&obj->peak, 0, sizeof(obj->peak)); + obj->peak.readback = readback; +} + +void pl_shader_extract_features(pl_shader sh, struct pl_color_space csp) +{ + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return; + + sh_describe(sh, "feature extraction"); + pl_shader_linearize(sh, &csp); + GLSL("// pl_shader_extract_features \n" + "{ \n" + "vec3 lms = %f * "$" * color.rgb; \n" + "lms = pow(max(lms, 0.0), vec3(%f)); \n" + "lms = (vec3(%f) + %f * lms) \n" + " / (vec3(1.0) + %f * lms); \n" + "lms = pow(lms, vec3(%f)); \n" + "float I = dot(vec3(%f, %f, %f), lms); \n" + "color = vec4(I, 0.0, 0.0, 1.0); \n" + "} \n", + PL_COLOR_SDR_WHITE / 10000, + SH_MAT3(pl_ipt_rgb2lms(pl_raw_primaries_get(csp.primaries))), + PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2, + pl_ipt_lms2ipt.m[0][0], pl_ipt_lms2ipt.m[0][1], pl_ipt_lms2ipt.m[0][2]); +} + +const struct pl_color_map_params pl_color_map_default_params = { PL_COLOR_MAP_DEFAULTS }; +const struct pl_color_map_params pl_color_map_high_quality_params = { PL_COLOR_MAP_HQ_DEFAULTS }; + +static ident_t rect_pos(pl_shader sh, pl_rect2df rc) +{ + if (!rc.x0 && !rc.x1) + rc.x1 = 1.0f; + if (!rc.y0 && !rc.y1) + rc.y1 = 1.0f; + + return sh_attr_vec2(sh, "tone_map_coords", &(pl_rect2df) { + .x0 = -rc.x0 / (rc.x1 - rc.x0), + .x1 = (1.0f - rc.x0) / (rc.x1 - rc.x0), + .y0 = -rc.y1 / (rc.y0 - rc.y1), + .y1 = (1.0f - rc.y1) / (rc.y0 - rc.y1), + }); +} + +static void visualize_tone_map(pl_shader sh, pl_rect2df rc, float alpha, + const struct pl_tone_map_params *params) +{ + pl_assert(params->input_scaling == PL_HDR_PQ); + pl_assert(params->output_scaling == PL_HDR_PQ); + + GLSL("// Visualize tone mapping \n" + "{ \n" + "vec2 pos = "$"; \n" + "if (min(pos.x, pos.y) >= 0.0 && \n" // visualizer rect + " max(pos.x, pos.y) <= 1.0) \n" + "{ \n" + "float xmin = "$"; \n" + "float xmax = "$"; \n" + "float xavg = "$"; \n" 
+ "float ymin = "$"; \n" + "float ymax = "$"; \n" + "float alpha = 0.8 * "$"; \n" + "vec3 viz = color.rgb; \n" + "float vv = tone_map(pos.x); \n" + // Color based on region + "if (pos.x < xmin || pos.x > xmax) { \n" // outside source + "} else if (pos.y < ymin || pos.y > ymax) {\n" // outside target + " if (pos.y < xmin || pos.y > xmax) { \n" // and also source + " viz = vec3(0.1, 0.1, 0.5); \n" + " } else { \n" + " viz = vec3(0.2, 0.05, 0.05); \n" // but inside source + " } \n" + "} else { \n" // inside domain + " if (abs(pos.x - pos.y) < 1e-3) { \n" // main diagonal + " viz = vec3(0.2); \n" + " } else if (pos.y < vv) { \n" // inside function + " alpha *= 0.6; \n" + " viz = vec3(0.05); \n" + " if (vv > pos.x && pos.y > pos.x) \n" // output brighter than input + " viz.rg = vec2(0.5, 0.7); \n" + " } else { \n" // outside function + " if (vv < pos.x && pos.y < pos.x) \n" // output darker than input + " viz = vec3(0.0, 0.1, 0.2); \n" + " } \n" + " if (pos.y > xmax) { \n" // inverse tone-mapping region + " vec3 hi = vec3(0.2, 0.5, 0.8); \n" + " viz = mix(viz, hi, 0.5); \n" + " } else if (pos.y < xmin) { \n" // black point region + " viz = mix(viz, vec3(0.0), 0.3); \n" + " } \n" + " if (xavg > 0.0 && abs(pos.x - xavg) < 1e-3)\n" // source avg brightness + " viz = vec3(0.5); \n" + "} \n" + "color.rgb = mix(color.rgb, viz, alpha); \n" + "} \n" + "} \n", + rect_pos(sh, rc), + SH_FLOAT_DYN(params->input_min), + SH_FLOAT_DYN(params->input_max), + SH_FLOAT_DYN(params->input_avg), + SH_FLOAT(params->output_min), + SH_FLOAT_DYN(params->output_max), + SH_FLOAT_DYN(alpha)); +} + +static void visualize_gamut_map(pl_shader sh, pl_rect2df rc, + ident_t lut, float hue, float theta, + const struct pl_gamut_map_params *params) +{ + ident_t ipt2lms = SH_MAT3(pl_ipt_ipt2lms); + ident_t lms2rgb_src = SH_MAT3(pl_ipt_lms2rgb(¶ms->input_gamut)); + ident_t lms2rgb_dst = SH_MAT3(pl_ipt_lms2rgb(¶ms->output_gamut)); + + GLSL("// Visualize gamut mapping \n" + "vec2 pos = "$"; \n" + "float pqmin = 
"$"; \n" + "float pqmax = "$"; \n" + "float rgbmin = "$"; \n" + "float rgbmax = "$"; \n" + "vec3 orig = ipt; \n" + "if (min(pos.x, pos.y) >= 0.0 && \n" + " max(pos.x, pos.y) <= 1.0) \n" + "{ \n" + // Source color to visualize + "float mid = mix(pqmin, pqmax, 0.6); \n" + "vec3 base = vec3(0.5, 0.0, 0.0); \n" + "float hue = "$", theta = "$"; \n" + "base.x = mix(base.x, mid, sin(theta)); \n" + "mat3 rot1 = mat3(1.0, 0.0, 0.0, \n" + " 0.0, cos(hue), sin(hue), \n" + " 0.0, -sin(hue), cos(hue)); \n" + "mat3 rot2 = mat3( cos(theta), 0.0, sin(theta), \n" + " 0.0, 1.0, 0.0, \n" + " -sin(theta), 0.0, cos(theta)); \n" + "vec3 dir = vec3(pos.yx - vec2(0.5), 0.0); \n" + "ipt = base + rot1 * rot2 * dir; \n" + // Convert back to RGB (for gamut boundary testing) + "lmspq = "$" * ipt; \n" + "lms = pow(max(lmspq, 0.0), vec3(1.0/%f)); \n" + "lms = max(lms - vec3(%f), 0.0) \n" + " / (vec3(%f) - %f * lms); \n" + "lms = pow(lms, vec3(1.0/%f)); \n" + "lms *= %f; \n" + // Check against src/dst gamut boundaries + "vec3 rgbsrc = "$" * lms; \n" + "vec3 rgbdst = "$" * lms; \n" + "bool insrc, indst; \n" + "insrc = all(lessThan(rgbsrc, vec3(rgbmax))) && \n" + " all(greaterThan(rgbsrc, vec3(rgbmin))); \n" + "indst = all(lessThan(rgbdst, vec3(rgbmax))) && \n" + " all(greaterThan(rgbdst, vec3(rgbmin))); \n" + // Sample from gamut mapping 3DLUT + "idx.x = (ipt.x - pqmin) / (pqmax - pqmin); \n" + "idx.y = 2.0 * length(ipt.yz); \n" + "idx.z = %f * atan(ipt.z, ipt.y) + 0.5; \n" + "vec3 mapped = "$"(idx).xyz; \n" + "mapped.yz -= vec2(32768.0/65535.0); \n" + "float mappedhue = atan(mapped.z, mapped.y); \n" + "float mappedchroma = length(mapped.yz); \n" + "ipt = mapped; \n" + // Visualize gamuts + "if (!insrc && !indst) { \n" + " ipt = orig; \n" + "} else if (insrc && !indst) { \n" + " ipt.x -= 0.1; \n" + "} else if (indst && !insrc) { \n" + " ipt.x += 0.1; \n" + "} \n" + // Visualize iso-luminance and iso-hue lines + "vec3 line; \n" + "if (insrc && fract(50.0 * mapped.x) < 1e-1) { \n" + " float k = 
smoothstep(0.1, 0.0, abs(sin(theta))); \n" + " line.x = mix(mapped.x, 0.3, 0.5); \n" + " line.yz = sqrt(length(mapped.yz)) * \n" + " normalize(mapped.yz); \n" + " ipt = mix(ipt, line, k); \n" + "} \n" + "if (insrc && fract(10.0 * (mappedhue - hue)) < 1e-1) {\n" + " float k = smoothstep(0.3, 0.0, abs(cos(theta))); \n" + " line.x = mapped.x - 0.05; \n" + " line.yz = 1.2 * mapped.yz; \n" + " ipt = mix(ipt, line, k); \n" + "} \n" + "if (insrc && fract(100.0 * mappedchroma) < 1e-1) { \n" + " line.x = mapped.x + 0.1; \n" + " line.yz = 0.4 * mapped.yz; \n" + " ipt = mix(ipt, line, 0.5); \n" + "} \n" + "} \n", + rect_pos(sh, rc), + SH_FLOAT(params->min_luma), SH_FLOAT(params->max_luma), + SH_FLOAT(pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, params->min_luma)), + SH_FLOAT(pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, params->max_luma)), + SH_FLOAT_DYN(hue), SH_FLOAT_DYN(theta), + ipt2lms, + PQ_M2, PQ_C1, PQ_C2, PQ_C3, PQ_M1, + 10000 / PL_COLOR_SDR_WHITE, + lms2rgb_src, + lms2rgb_dst, + 0.5f / M_PI, + lut); +} + +static void fill_tone_lut(void *data, const struct sh_lut_params *params) +{ + const struct pl_tone_map_params *lut_params = params->priv; + pl_tone_map_generate(data, lut_params); +} + +static void fill_gamut_lut(void *data, const struct sh_lut_params *params) +{ + const struct pl_gamut_map_params *lut_params = params->priv; + const int lut_size = params->width * params->height * params->depth; + void *tmp = pl_alloc(NULL, lut_size * sizeof(float) * lut_params->lut_stride); + pl_gamut_map_generate(tmp, lut_params); + + // Convert to 16-bit unsigned integer for GPU texture + const float *in = tmp; + uint16_t *out = data; + pl_assert(lut_params->lut_stride == 3); + pl_assert(params->comps == 4); + for (int i = 0; i < lut_size; i++) { + out[0] = roundf(in[0] * UINT16_MAX); + out[1] = roundf(in[1] * UINT16_MAX + (UINT16_MAX >> 1)); + out[2] = roundf(in[2] * UINT16_MAX + (UINT16_MAX >> 1)); + in += 3; + out += 4; + } + + pl_free(tmp); +} + +void pl_shader_color_map_ex(pl_shader sh, 
const struct pl_color_map_params *params, + const struct pl_color_map_args *args) +{ + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return; + + struct pl_color_space src = args->src, dst = args->dst; + pl_color_space_infer_map(&src, &dst); + if (pl_color_space_equal(&src, &dst)) { + if (args->prelinearized) + pl_shader_delinearize(sh, &dst); + return; + } + + struct sh_color_map_obj *obj = NULL; + if (args->state) { + pl_get_detected_hdr_metadata(*args->state, &src.hdr); + obj = SH_OBJ(sh, args->state, PL_SHADER_OBJ_COLOR_MAP, struct sh_color_map_obj, + sh_color_map_uninit); + if (!obj) + return; + } + + params = PL_DEF(params, &pl_color_map_default_params); + GLSL("// pl_shader_color_map \n" + "{ \n"); + + struct pl_tone_map_params tone = { + .function = PL_DEF(params->tone_mapping_function, &pl_tone_map_clip), + .constants = params->tone_constants, + .param = params->tone_mapping_param, + .input_scaling = PL_HDR_PQ, + .output_scaling = PL_HDR_PQ, + .lut_size = PL_DEF(params->lut_size, pl_color_map_default_params.lut_size), + .hdr = src.hdr, + }; + + pl_color_space_nominal_luma_ex(pl_nominal_luma_params( + .color = &src, + .metadata = params->metadata, + .scaling = tone.input_scaling, + .out_min = &tone.input_min, + .out_max = &tone.input_max, + .out_avg = &tone.input_avg, + )); + + pl_color_space_nominal_luma_ex(pl_nominal_luma_params( + .color = &dst, + .metadata = PL_HDR_METADATA_HDR10, + .scaling = tone.output_scaling, + .out_min = &tone.output_min, + .out_max = &tone.output_max, + )); + + pl_tone_map_params_infer(&tone); + + // Round sufficiently similar values + if (fabs(tone.input_max - tone.output_max) < 1e-6) + tone.output_max = tone.input_max; + if (fabs(tone.input_min - tone.output_min) < 1e-6) + tone.output_min = tone.input_min; + + if (!params->inverse_tone_mapping) { + // Never exceed the source unless requested, but still allow + // black point adaptation + tone.output_max = PL_MIN(tone.output_max, tone.input_max); + } + + const int 
*lut3d_size_def = pl_color_map_default_params.lut3d_size; + struct pl_gamut_map_params gamut = { + .function = PL_DEF(params->gamut_mapping, &pl_gamut_map_clip), + .constants = params->gamut_constants, + .input_gamut = src.hdr.prim, + .output_gamut = dst.hdr.prim, + .lut_size_I = PL_DEF(params->lut3d_size[0], lut3d_size_def[0]), + .lut_size_C = PL_DEF(params->lut3d_size[1], lut3d_size_def[1]), + .lut_size_h = PL_DEF(params->lut3d_size[2], lut3d_size_def[2]), + .lut_stride = 3, + }; + + float src_peak_static; + pl_color_space_nominal_luma_ex(pl_nominal_luma_params( + .color = &src, + .metadata = PL_HDR_METADATA_HDR10, + .scaling = PL_HDR_PQ, + .out_max = &src_peak_static, + )); + + pl_color_space_nominal_luma_ex(pl_nominal_luma_params( + .color = &dst, + .metadata = PL_HDR_METADATA_HDR10, + .scaling = PL_HDR_PQ, + .out_min = &gamut.min_luma, + .out_max = &gamut.max_luma, + )); + + // Clip the gamut mapping output to the input gamut if disabled + if (!params->gamut_expansion && gamut.function->bidirectional) { + if (pl_primaries_compatible(&gamut.input_gamut, &gamut.output_gamut)) { + gamut.output_gamut = pl_primaries_clip(&gamut.output_gamut, + &gamut.input_gamut); + } + } + + // Backwards compatibility with older API + switch (params->gamut_mode) { + case PL_GAMUT_CLIP: + switch (params->intent) { + case PL_INTENT_AUTO: + case PL_INTENT_PERCEPTUAL: + case PL_INTENT_RELATIVE_COLORIMETRIC: + break; // leave default + case PL_INTENT_SATURATION: + gamut.function = &pl_gamut_map_saturation; + break; + case PL_INTENT_ABSOLUTE_COLORIMETRIC: + gamut.function = &pl_gamut_map_absolute; + break; + } + break; + case PL_GAMUT_DARKEN: + gamut.function = &pl_gamut_map_darken; + break; + case PL_GAMUT_WARN: + gamut.function = &pl_gamut_map_highlight; + break; + case PL_GAMUT_DESATURATE: + gamut.function = &pl_gamut_map_desaturate; + break; + case PL_GAMUT_MODE_COUNT: + pl_unreachable(); + } + + bool can_fast = !params->force_tone_mapping_lut; + if (!args->state) { + // No state 
object provided, forcibly disable advanced methods + can_fast = true; + if (tone.function != &pl_tone_map_clip) + tone.function = &pl_tone_map_linear; + if (gamut.function != &pl_gamut_map_clip) + gamut.function = &pl_gamut_map_saturation; + } + + pl_fmt gamut_fmt = pl_find_fmt(SH_GPU(sh), PL_FMT_UNORM, 4, 16, 16, PL_FMT_CAP_LINEAR); + if (!gamut_fmt) { + gamut.function = &pl_gamut_map_saturation; + can_fast = true; + } + + bool need_tone_map = !pl_tone_map_params_noop(&tone); + bool need_gamut_map = !pl_gamut_map_params_noop(&gamut); + + if (!args->prelinearized) + pl_shader_linearize(sh, &src); + + pl_matrix3x3 rgb2lms = pl_ipt_rgb2lms(pl_raw_primaries_get(src.primaries)); + pl_matrix3x3 lms2rgb = pl_ipt_lms2rgb(pl_raw_primaries_get(dst.primaries)); + ident_t lms2ipt = SH_MAT3(pl_ipt_lms2ipt); + ident_t ipt2lms = SH_MAT3(pl_ipt_ipt2lms); + + if (need_gamut_map && gamut.function == &pl_gamut_map_saturation && can_fast) { + const pl_matrix3x3 lms2src = pl_ipt_lms2rgb(&gamut.input_gamut); + const pl_matrix3x3 dst2lms = pl_ipt_rgb2lms(&gamut.output_gamut); + sh_describe(sh, "gamut map (saturation)"); + pl_matrix3x3_mul(&lms2rgb, &dst2lms); + pl_matrix3x3_mul(&lms2rgb, &lms2src); + need_gamut_map = false; + } + + // Fast path: simply convert between primaries (if needed) + if (!need_tone_map && !need_gamut_map) { + if (src.primaries != dst.primaries) { + sh_describe(sh, "colorspace conversion"); + pl_matrix3x3_mul(&lms2rgb, &rgb2lms); + GLSL("color.rgb = "$" * color.rgb; \n", SH_MAT3(lms2rgb)); + } + goto done; + } + + // Full path: convert input from normalized RGB to IPT + GLSL("vec3 lms = "$" * color.rgb; \n" + "vec3 lmspq = %f * lms; \n" + "lmspq = pow(max(lmspq, 0.0), vec3(%f)); \n" + "lmspq = (vec3(%f) + %f * lmspq) \n" + " / (vec3(1.0) + %f * lmspq); \n" + "lmspq = pow(lmspq, vec3(%f)); \n" + "vec3 ipt = "$" * lmspq; \n" + "float i_orig = ipt.x; \n", + SH_MAT3(rgb2lms), + PL_COLOR_SDR_WHITE / 10000, + PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2, + lms2ipt); + + if 
(params->show_clipping) { + const float eps = 1e-6f; + GLSL("bool clip_hi, clip_lo; \n" + "clip_hi = any(greaterThan(color.rgb, vec3("$"))); \n" + "clip_lo = any(lessThan(color.rgb, vec3("$"))); \n" + "clip_hi = clip_hi || ipt.x > "$"; \n" + "clip_lo = clip_lo || ipt.x < "$"; \n", + SH_FLOAT_DYN(pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, tone.input_max) + eps), + SH_FLOAT(pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, tone.input_min) - eps), + SH_FLOAT_DYN(tone.input_max + eps), + SH_FLOAT(tone.input_min - eps)); + } + + if (need_tone_map) { + const struct pl_tone_map_function *fun = tone.function; + sh_describef(sh, "%s tone map (%.0f -> %.0f)", fun->name, + pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NITS, tone.input_max), + pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NITS, tone.output_max)); + + if (fun == &pl_tone_map_clip && can_fast) { + + GLSL("#define tone_map(x) clamp((x), "$", "$") \n", + SH_FLOAT(tone.input_min), + SH_FLOAT_DYN(tone.input_max)); + + } else if (fun == &pl_tone_map_linear && can_fast) { + + const float gain = tone.constants.exposure; + const float scale = tone.input_max - tone.input_min; + + ident_t linfun = sh_fresh(sh, "linear_pq"); + GLSLH("float "$"(float x) { \n" + // Stretch the input range (while clipping) + " x = "$" * x + "$"; \n" + " x = clamp(x, 0.0, 1.0); \n" + " x = "$" * x + "$"; \n" + " return x; \n" + "} \n", + linfun, + SH_FLOAT_DYN(gain / scale), + SH_FLOAT_DYN(-gain / scale * tone.input_min), + SH_FLOAT_DYN(tone.output_max - tone.output_min), + SH_FLOAT(tone.output_min)); + + GLSL("#define tone_map(x) ("$"(x)) \n", linfun); + + } else { + + pl_assert(obj); + ident_t lut = sh_lut(sh, sh_lut_params( + .object = &obj->tone.lut, + .var_type = PL_VAR_FLOAT, + .lut_type = SH_LUT_AUTO, + .method = SH_LUT_LINEAR, + .width = tone.lut_size, + .comps = 1, + .update = !pl_tone_map_params_equal(&tone, &obj->tone.params), + .dynamic = tone.input_avg > 0, // dynamic metadata + .fill = fill_tone_lut, + .priv = &tone, + )); + obj->tone.params = tone; + if (!lut) { + 
SH_FAIL(sh, "Failed generating tone-mapping LUT!"); + return; + } + + const float lut_range = tone.input_max - tone.input_min; + GLSL("#define tone_map(x) ("$"("$" * (x) + "$")) \n", + lut, SH_FLOAT_DYN(1.0f / lut_range), + SH_FLOAT_DYN(-tone.input_min / lut_range)); + + } + + bool need_recovery = tone.input_max >= tone.output_max; + if (need_recovery && params->contrast_recovery && args->feature_map) { + ident_t pos, pt; + ident_t lowres = sh_bind(sh, args->feature_map, PL_TEX_ADDRESS_CLAMP, + PL_TEX_SAMPLE_LINEAR, "feature_map", + NULL, &pos, &pt); + + // Obtain HF detail map from bicubic interpolation of LF features + GLSL("vec2 lpos = "$"; \n" + "vec2 lpt = "$"; \n" + "vec2 lsize = vec2(textureSize("$", 0)); \n" + "vec2 frac = fract(lpos * lsize + vec2(0.5)); \n" + "vec2 frac2 = frac * frac; \n" + "vec2 inv = vec2(1.0) - frac; \n" + "vec2 inv2 = inv * inv; \n" + "vec2 w0 = 1.0/6.0 * inv2 * inv; \n" + "vec2 w1 = 2.0/3.0 - 0.5 * frac2 * (2.0 - frac); \n" + "vec2 w2 = 2.0/3.0 - 0.5 * inv2 * (2.0 - inv); \n" + "vec2 w3 = 1.0/6.0 * frac2 * frac; \n" + "vec4 g = vec4(w0 + w1, w2 + w3); \n" + "vec4 h = vec4(w1, w3) / g + inv.xyxy; \n" + "h.xy -= vec2(2.0); \n" + "vec4 p = lpos.xyxy + lpt.xyxy * h; \n" + "float l00 = textureLod("$", p.xy, 0.0).r; \n" + "float l01 = textureLod("$", p.xw, 0.0).r; \n" + "float l0 = mix(l01, l00, g.y); \n" + "float l10 = textureLod("$", p.zy, 0.0).r; \n" + "float l11 = textureLod("$", p.zw, 0.0).r; \n" + "float l1 = mix(l11, l10, g.y); \n" + "float luma = mix(l1, l0, g.x); \n" + // Mix low-resolution tone mapped image with high-resolution + // tone mapped image according to desired strength. 
+ "float highres = clamp(ipt.x, 0.0, 1.0); \n" + "float lowres = clamp(luma, 0.0, 1.0); \n" + "float detail = highres - lowres; \n" + "float base = tone_map(highres); \n" + "float sharp = tone_map(lowres) + detail; \n" + "ipt.x = clamp(mix(base, sharp, "$"), "$", "$"); \n", + pos, pt, lowres, + lowres, lowres, lowres, lowres, + SH_FLOAT(params->contrast_recovery), + SH_FLOAT(tone.output_min), SH_FLOAT_DYN(tone.output_max)); + + } else { + + GLSL("ipt.x = tone_map(ipt.x); \n"); + } + + // Avoid raising saturation excessively when raising brightness, and + // also desaturate when reducing brightness greatly to account for the + // reduction in gamut volume. + GLSL("vec2 hull = vec2(i_orig, ipt.x); \n" + "hull = ((hull - 6.0) * hull + 9.0) * hull; \n" + "ipt.yz *= min(i_orig / ipt.x, hull.y / hull.x); \n"); + } + + if (need_gamut_map) { + const struct pl_gamut_map_function *fun = gamut.function; + sh_describef(sh, "gamut map (%s)", fun->name); + + pl_assert(obj); + ident_t lut = sh_lut(sh, sh_lut_params( + .object = &obj->gamut.lut, + .var_type = PL_VAR_FLOAT, + .lut_type = SH_LUT_TEXTURE, + .fmt = gamut_fmt, + .method = params->lut3d_tricubic ? 
SH_LUT_CUBIC : SH_LUT_LINEAR, + .width = gamut.lut_size_I, + .height = gamut.lut_size_C, + .depth = gamut.lut_size_h, + .comps = 4, + .signature = gamut_map_signature(&gamut), + .cache = SH_CACHE(sh), + .fill = fill_gamut_lut, + .priv = &gamut, + )); + if (!lut) { + SH_FAIL(sh, "Failed generating gamut-mapping LUT!"); + return; + } + + // 3D LUT lookup (in ICh space) + const float lut_range = gamut.max_luma - gamut.min_luma; + GLSL("vec3 idx; \n" + "idx.x = "$" * ipt.x + "$"; \n" + "idx.y = 2.0 * length(ipt.yz); \n" + "idx.z = %f * atan(ipt.z, ipt.y) + 0.5;\n" + "ipt = "$"(idx).xyz; \n" + "ipt.yz -= vec2(32768.0/65535.0); \n", + SH_FLOAT(1.0f / lut_range), + SH_FLOAT(-gamut.min_luma / lut_range), + 0.5f / M_PI, lut); + + if (params->show_clipping) { + GLSL("clip_lo = clip_lo || any(lessThan(idx, vec3(0.0))); \n" + "clip_hi = clip_hi || any(greaterThan(idx, vec3(1.0))); \n"); + } + + if (params->visualize_lut) { + visualize_gamut_map(sh, params->visualize_rect, lut, + params->visualize_hue, params->visualize_theta, + &gamut); + } + } + + // Convert IPT back to linear RGB + GLSL("lmspq = "$" * ipt; \n" + "lms = pow(max(lmspq, 0.0), vec3(1.0/%f)); \n" + "lms = max(lms - vec3(%f), 0.0) \n" + " / (vec3(%f) - %f * lms); \n" + "lms = pow(lms, vec3(1.0/%f)); \n" + "lms *= %f; \n" + "color.rgb = "$" * lms; \n", + ipt2lms, + PQ_M2, PQ_C1, PQ_C2, PQ_C3, PQ_M1, + 10000 / PL_COLOR_SDR_WHITE, + SH_MAT3(lms2rgb)); + + if (params->show_clipping) { + GLSL("if (clip_hi) { \n" + " float k = dot(color.rgb, vec3(2.0 / 3.0)); \n" + " color.rgb = clamp(vec3(k) - color.rgb, 0.0, 1.0); \n" + " float cmin = min(min(color.r, color.g), color.b); \n" + " float cmax = max(max(color.r, color.g), color.b); \n" + " float delta = cmax - cmin; \n" + " vec3 sat = smoothstep(cmin - 1e-6, cmax, color.rgb); \n" + " const vec3 red = vec3(1.0, 0.0, 0.0); \n" + " color.rgb = mix(red, sat, smoothstep(0.0, 0.3, delta)); \n" + "} else if (clip_lo) { \n" + " vec3 hi = vec3(0.0, 0.3, 0.3); \n" + " color.rgb = 
mix(color.rgb, hi, 0.5); \n" + "} \n"); + } + + if (need_tone_map) { + if (params->visualize_lut) { + float alpha = need_gamut_map ? powf(cosf(params->visualize_theta), 5.0f) : 1.0f; + visualize_tone_map(sh, params->visualize_rect, alpha, &tone); + } + GLSL("#undef tone_map \n"); + } + +done: + pl_shader_delinearize(sh, &dst); + GLSL("}\n"); +} + +// Backwards compatibility wrapper around `pl_shader_color_map_ex` +void pl_shader_color_map(pl_shader sh, const struct pl_color_map_params *params, + struct pl_color_space src, struct pl_color_space dst, + pl_shader_obj *state, bool prelinearized) +{ + pl_shader_color_map_ex(sh, params, pl_color_map_args( + .src = src, + .dst = dst, + .prelinearized = prelinearized, + .state = state, + .feature_map = NULL + )); +} + +void pl_shader_cone_distort(pl_shader sh, struct pl_color_space csp, + const struct pl_cone_params *params) +{ + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return; + if (!params || !params->cones) + return; + + sh_describe(sh, "cone distortion"); + GLSL("// pl_shader_cone_distort\n"); + GLSL("{\n"); + + pl_color_space_infer(&csp); + pl_shader_linearize(sh, &csp); + + pl_matrix3x3 cone_mat; + cone_mat = pl_get_cone_matrix(params, pl_raw_primaries_get(csp.primaries)); + GLSL("color.rgb = "$" * color.rgb; \n", sh_var(sh, (struct pl_shader_var) { + .var = pl_var_mat3("cone_mat"), + .data = PL_TRANSPOSE_3X3(cone_mat.m), + })); + + pl_shader_delinearize(sh, &csp); + GLSL("}\n"); +} diff --git a/src/shaders/custom.c b/src/shaders/custom.c new file mode 100644 index 0000000..3f03e57 --- /dev/null +++ b/src/shaders/custom.c @@ -0,0 +1,89 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "shaders.h" + +#include <libplacebo/shaders/custom.h> + +bool pl_shader_custom(pl_shader sh, const struct pl_custom_shader *params) +{ + if (params->compute) { + int bw = PL_DEF(params->compute_group_size[0], 16); + int bh = PL_DEF(params->compute_group_size[1], 16); + bool flex = !params->compute_group_size[0] || + !params->compute_group_size[1]; + if (!sh_try_compute(sh, bw, bh, flex, params->compute_shmem)) + return false; + } + + if (!sh_require(sh, params->input, params->output_w, params->output_h)) + return false; + + sh->output = params->output; + + for (int i = 0; i < params->num_variables; i++) { + struct pl_shader_var sv = params->variables[i]; + GLSLP("#define %s "$"\n", sv.var.name, sh_var(sh, sv)); + } + + for (int i = 0; i < params->num_descriptors; i++) { + struct pl_shader_desc sd = params->descriptors[i]; + GLSLP("#define %s "$"\n", sd.desc.name, sh_desc(sh, sd)); + } + + for (int i = 0; i < params->num_vertex_attribs; i++) { + struct pl_shader_va sva = params->vertex_attribs[i]; + GLSLP("#define %s "$"\n", sva.attr.name, sh_attr(sh, sva)); + } + + for (int i = 0; i < params->num_constants; i++) { + struct pl_shader_const sc = params->constants[i]; + GLSLP("#define %s "$"\n", sc.name, sh_const(sh, sc)); + } + + if (params->prelude) + GLSLP("// pl_shader_custom prelude: \n%s\n", params->prelude); + if (params->header) + GLSLH("// pl_shader_custom header: \n%s\n", params->header); + + if (params->description) + sh_describef(sh, "%s", params->description); + + if (params->body) { + const char *output_decl = ""; + if (params->output 
!= params->input) { + switch (params->output) { + case PL_SHADER_SIG_NONE: break; + case PL_SHADER_SIG_COLOR: + output_decl = "vec4 color = vec4(0.0);"; + break; + + case PL_SHADER_SIG_SAMPLER: + pl_unreachable(); + } + } + + GLSL("// pl_shader_custom \n" + "%s \n" + "{ \n" + "%s \n" + "} \n", + output_decl, params->body); + } + + return true; +} diff --git a/src/shaders/custom_mpv.c b/src/shaders/custom_mpv.c new file mode 100644 index 0000000..4ef0817 --- /dev/null +++ b/src/shaders/custom_mpv.c @@ -0,0 +1,1768 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include <math.h> +#include <limits.h> + +#include "gpu.h" +#include "shaders.h" + +#include <libplacebo/shaders/colorspace.h> +#include <libplacebo/shaders/custom.h> + +// Hard-coded size limits, mainly for convenience (to avoid dynamic memory) +#define SHADER_MAX_HOOKS 16 +#define SHADER_MAX_BINDS 16 +#define MAX_SHEXP_SIZE 32 + +enum shexp_op { + SHEXP_OP_ADD, + SHEXP_OP_SUB, + SHEXP_OP_MUL, + SHEXP_OP_DIV, + SHEXP_OP_MOD, + SHEXP_OP_NOT, + SHEXP_OP_GT, + SHEXP_OP_LT, + SHEXP_OP_EQ, +}; + +enum shexp_tag { + SHEXP_END = 0, // End of an RPN expression + SHEXP_CONST, // Push a constant value onto the stack + SHEXP_TEX_W, // Get the width/height of a named texture (variable) + SHEXP_TEX_H, + SHEXP_OP2, // Pop two elements and push the result of a dyadic operation + SHEXP_OP1, // Pop one element and push the result of a monadic operation + SHEXP_VAR, // Arbitrary variable (e.g. shader parameters) +}; + +struct shexp { + enum shexp_tag tag; + union { + float cval; + pl_str varname; + enum shexp_op op; + } val; +}; + +struct custom_shader_hook { + // Variable/literal names of textures + pl_str pass_desc; + pl_str hook_tex[SHADER_MAX_HOOKS]; + pl_str bind_tex[SHADER_MAX_BINDS]; + pl_str save_tex; + + // Shader body itself + metadata + pl_str pass_body; + float offset[2]; + bool offset_align; + int comps; + + // Special expressions governing the output size and execution conditions + struct shexp width[MAX_SHEXP_SIZE]; + struct shexp height[MAX_SHEXP_SIZE]; + struct shexp cond[MAX_SHEXP_SIZE]; + + // Special metadata for compute shaders + bool is_compute; + int block_w, block_h; // Block size (each block corresponds to one WG) + int threads_w, threads_h; // How many threads form a WG +}; + +static bool parse_rpn_shexpr(pl_str line, struct shexp out[MAX_SHEXP_SIZE]) +{ + int pos = 0; + + while (line.len > 0) { + pl_str word = pl_str_split_char(line, ' ', &line); + if (word.len == 0) + continue; + + if (pos >= MAX_SHEXP_SIZE) + return false; + + struct shexp *exp 
= &out[pos++]; + + if (pl_str_eatend0(&word, ".w") || pl_str_eatend0(&word, ".width")) { + exp->tag = SHEXP_TEX_W; + exp->val.varname = word; + continue; + } + + if (pl_str_eatend0(&word, ".h") || pl_str_eatend0(&word, ".height")) { + exp->tag = SHEXP_TEX_H; + exp->val.varname = word; + continue; + } + + switch (word.buf[0]) { + case '+': exp->tag = SHEXP_OP2; exp->val.op = SHEXP_OP_ADD; continue; + case '-': exp->tag = SHEXP_OP2; exp->val.op = SHEXP_OP_SUB; continue; + case '*': exp->tag = SHEXP_OP2; exp->val.op = SHEXP_OP_MUL; continue; + case '/': exp->tag = SHEXP_OP2; exp->val.op = SHEXP_OP_DIV; continue; + case '%': exp->tag = SHEXP_OP2; exp->val.op = SHEXP_OP_MOD; continue; + case '!': exp->tag = SHEXP_OP1; exp->val.op = SHEXP_OP_NOT; continue; + case '>': exp->tag = SHEXP_OP2; exp->val.op = SHEXP_OP_GT; continue; + case '<': exp->tag = SHEXP_OP2; exp->val.op = SHEXP_OP_LT; continue; + case '=': exp->tag = SHEXP_OP2; exp->val.op = SHEXP_OP_EQ; continue; + } + + if (word.buf[0] >= '0' && word.buf[0] <= '9') { + exp->tag = SHEXP_CONST; + if (!pl_str_parse_float(word, &exp->val.cval)) + return false; + continue; + } + + // Treat as generic variable + exp->tag = SHEXP_VAR; + exp->val.varname = word; + } + + return true; +} + +static inline pl_str split_magic(pl_str *body) +{ + pl_str ret = pl_str_split_str0(*body, "//!", body); + if (body->len) { + // Make sure the separator is included in the remainder + body->buf -= 3; + body->len += 3; + } + + return ret; +} + +static bool parse_hook(pl_log log, pl_str *body, struct custom_shader_hook *out) +{ + *out = (struct custom_shader_hook){ + .pass_desc = pl_str0("unknown user shader"), + .width = {{ SHEXP_TEX_W, { .varname = pl_str0("HOOKED") }}}, + .height = {{ SHEXP_TEX_H, { .varname = pl_str0("HOOKED") }}}, + .cond = {{ SHEXP_CONST, { .cval = 1.0 }}}, + }; + + int hook_idx = 0; + int bind_idx = 0; + + // Parse all headers + while (true) { + pl_str rest; + pl_str line = pl_str_strip(pl_str_getline(*body, &rest)); + + 
// Check for the presence of the magic line beginning + if (!pl_str_eatstart0(&line, "//!")) + break; + + *body = rest; + + // Parse the supported commands + if (pl_str_eatstart0(&line, "HOOK")) { + if (hook_idx == SHADER_MAX_HOOKS) { + pl_err(log, "Passes may only hook up to %d textures!", + SHADER_MAX_HOOKS); + return false; + } + out->hook_tex[hook_idx++] = pl_str_strip(line); + continue; + } + + if (pl_str_eatstart0(&line, "BIND")) { + if (bind_idx == SHADER_MAX_BINDS) { + pl_err(log, "Passes may only bind up to %d textures!", + SHADER_MAX_BINDS); + return false; + } + out->bind_tex[bind_idx++] = pl_str_strip(line); + continue; + } + + if (pl_str_eatstart0(&line, "SAVE")) { + pl_str save_tex = pl_str_strip(line); + if (pl_str_equals0(save_tex, "HOOKED")) { + // This is a special name that means "overwrite existing" + // texture, which we just signal by not having any `save_tex` + // name set. + out->save_tex = (pl_str) {0}; + } else if (pl_str_equals0(save_tex, "MAIN")) { + // Compatibility alias + out->save_tex = pl_str0("MAINPRESUB"); + } else { + out->save_tex = save_tex; + }; + continue; + } + + if (pl_str_eatstart0(&line, "DESC")) { + out->pass_desc = pl_str_strip(line); + continue; + } + + if (pl_str_eatstart0(&line, "OFFSET")) { + line = pl_str_strip(line); + if (pl_str_equals0(line, "ALIGN")) { + out->offset_align = true; + } else { + if (!pl_str_parse_float(pl_str_split_char(line, ' ', &line), &out->offset[0]) || + !pl_str_parse_float(pl_str_split_char(line, ' ', &line), &out->offset[1]) || + line.len) + { + pl_err(log, "Error while parsing OFFSET!"); + return false; + } + } + continue; + } + + if (pl_str_eatstart0(&line, "WIDTH")) { + if (!parse_rpn_shexpr(line, out->width)) { + pl_err(log, "Error while parsing WIDTH!"); + return false; + } + continue; + } + + if (pl_str_eatstart0(&line, "HEIGHT")) { + if (!parse_rpn_shexpr(line, out->height)) { + pl_err(log, "Error while parsing HEIGHT!"); + return false; + } + continue; + } + + if 
(pl_str_eatstart0(&line, "WHEN")) { + if (!parse_rpn_shexpr(line, out->cond)) { + pl_err(log, "Error while parsing WHEN!"); + return false; + } + continue; + } + + if (pl_str_eatstart0(&line, "COMPONENTS")) { + if (!pl_str_parse_int(pl_str_strip(line), &out->comps)) { + pl_err(log, "Error parsing COMPONENTS: '%.*s'", PL_STR_FMT(line)); + return false; + } + continue; + } + + if (pl_str_eatstart0(&line, "COMPUTE")) { + line = pl_str_strip(line); + bool ok = pl_str_parse_int(pl_str_split_char(line, ' ', &line), &out->block_w) && + pl_str_parse_int(pl_str_split_char(line, ' ', &line), &out->block_h); + + line = pl_str_strip(line); + if (ok && line.len) { + ok = pl_str_parse_int(pl_str_split_char(line, ' ', &line), &out->threads_w) && + pl_str_parse_int(pl_str_split_char(line, ' ', &line), &out->threads_h) && + !line.len; + } else { + out->threads_w = out->block_w; + out->threads_h = out->block_h; + } + + if (!ok) { + pl_err(log, "Error while parsing COMPUTE!"); + return false; + } + + out->is_compute = true; + continue; + } + + // Unknown command type + pl_err(log, "Unrecognized command '%.*s'!", PL_STR_FMT(line)); + return false; + } + + // The rest of the file up until the next magic line beginning (if any) + // shall be the shader body + out->pass_body = split_magic(body); + + // Sanity checking + if (hook_idx == 0) + pl_warn(log, "Pass has no hooked textures (will be ignored)!"); + + return true; +} + +static bool parse_tex(pl_gpu gpu, void *alloc, pl_str *body, + struct pl_shader_desc *out) +{ + *out = (struct pl_shader_desc) { + .desc = { + .name = "USER_TEX", + .type = PL_DESC_SAMPLED_TEX, + }, + }; + + struct pl_tex_params params = { + .w = 1, .h = 1, .d = 0, + .sampleable = true, + .debug_tag = PL_DEBUG_TAG, + }; + + while (true) { + pl_str rest; + pl_str line = pl_str_strip(pl_str_getline(*body, &rest)); + + if (!pl_str_eatstart0(&line, "//!")) + break; + + *body = rest; + + if (pl_str_eatstart0(&line, "TEXTURE")) { + out->desc.name = pl_strdup0(alloc, 
pl_str_strip(line)); + continue; + } + + if (pl_str_eatstart0(&line, "SIZE")) { + line = pl_str_strip(line); + int dims = 0; + int dim[4]; // extra space to catch invalid extra entries + while (line.len && dims < PL_ARRAY_SIZE(dim)) { + if (!pl_str_parse_int(pl_str_split_char(line, ' ', &line), &dim[dims++])) { + PL_ERR(gpu, "Error while parsing SIZE!"); + return false; + } + } + + uint32_t lim = dims == 1 ? gpu->limits.max_tex_1d_dim + : dims == 2 ? gpu->limits.max_tex_2d_dim + : dims == 3 ? gpu->limits.max_tex_3d_dim + : 0; + + // Sanity check against GPU size limits + switch (dims) { + case 3: + params.d = dim[2]; + if (params.d < 1 || params.d > lim) { + PL_ERR(gpu, "SIZE %d exceeds GPU's texture size limits (%d)!", + params.d, lim); + return false; + } + // fall through + case 2: + params.h = dim[1]; + if (params.h < 1 || params.h > lim) { + PL_ERR(gpu, "SIZE %d exceeds GPU's texture size limits (%d)!", + params.h, lim); + return false; + } + // fall through + case 1: + params.w = dim[0]; + if (params.w < 1 || params.w > lim) { + PL_ERR(gpu, "SIZE %d exceeds GPU's texture size limits (%d)!", + params.w, lim); + return false; + } + break; + + default: + PL_ERR(gpu, "Invalid number of texture dimensions!"); + return false; + }; + + // Clear out the superfluous components + if (dims < 3) + params.d = 0; + if (dims < 2) + params.h = 0; + continue; + } + + if (pl_str_eatstart0(&line, "FORMAT")) { + line = pl_str_strip(line); + params.format = NULL; + for (int n = 0; n < gpu->num_formats; n++) { + pl_fmt fmt = gpu->formats[n]; + if (pl_str_equals0(line, fmt->name)) { + params.format = fmt; + break; + } + } + + if (!params.format || params.format->opaque) { + PL_ERR(gpu, "Unrecognized/unavailable FORMAT name: '%.*s'!", + PL_STR_FMT(line)); + return false; + } + + if (!(params.format->caps & PL_FMT_CAP_SAMPLEABLE)) { + PL_ERR(gpu, "Chosen FORMAT '%.*s' is not sampleable!", + PL_STR_FMT(line)); + return false; + } + continue; + } + + if (pl_str_eatstart0(&line, 
"FILTER")) { + line = pl_str_strip(line); + if (pl_str_equals0(line, "LINEAR")) { + out->binding.sample_mode = PL_TEX_SAMPLE_LINEAR; + } else if (pl_str_equals0(line, "NEAREST")) { + out->binding.sample_mode = PL_TEX_SAMPLE_NEAREST; + } else { + PL_ERR(gpu, "Unrecognized FILTER: '%.*s'!", PL_STR_FMT(line)); + return false; + } + continue; + } + + if (pl_str_eatstart0(&line, "BORDER")) { + line = pl_str_strip(line); + if (pl_str_equals0(line, "CLAMP")) { + out->binding.address_mode = PL_TEX_ADDRESS_CLAMP; + } else if (pl_str_equals0(line, "REPEAT")) { + out->binding.address_mode = PL_TEX_ADDRESS_REPEAT; + } else if (pl_str_equals0(line, "MIRROR")) { + out->binding.address_mode = PL_TEX_ADDRESS_MIRROR; + } else { + PL_ERR(gpu, "Unrecognized BORDER: '%.*s'!", PL_STR_FMT(line)); + return false; + } + continue; + } + + if (pl_str_eatstart0(&line, "STORAGE")) { + params.storable = true; + out->desc.type = PL_DESC_STORAGE_IMG; + out->desc.access = PL_DESC_ACCESS_READWRITE; + out->memory = PL_MEMORY_COHERENT; + continue; + } + + PL_ERR(gpu, "Unrecognized command '%.*s'!", PL_STR_FMT(line)); + return false; + } + + if (!params.format) { + PL_ERR(gpu, "No FORMAT specified!"); + return false; + } + + int caps = params.format->caps; + if (out->binding.sample_mode == PL_TEX_SAMPLE_LINEAR && !(caps & PL_FMT_CAP_LINEAR)) { + PL_ERR(gpu, "The specified texture format cannot be linear filtered!"); + return false; + } + + // Decode the rest of the section (up to the next //! 
marker) as raw hex + // data for the texture + pl_str tex, hexdata = split_magic(body); + if (!pl_str_decode_hex(NULL, pl_str_strip(hexdata), &tex)) { + PL_ERR(gpu, "Error while parsing TEXTURE body: must be a valid " + "hexadecimal sequence!"); + return false; + } + + int texels = params.w * PL_DEF(params.h, 1) * PL_DEF(params.d, 1); + size_t expected_len = texels * params.format->texel_size; + if (tex.len == 0 && params.storable) { + // In this case, it's okay that the texture has no initial data + pl_free_ptr(&tex.buf); + } else if (tex.len != expected_len) { + PL_ERR(gpu, "Shader TEXTURE size mismatch: got %zu bytes, expected %zu!", + tex.len, expected_len); + pl_free(tex.buf); + return false; + } + + params.initial_data = tex.buf; + out->binding.object = pl_tex_create(gpu, &params); + pl_free(tex.buf); + + if (!out->binding.object) { + PL_ERR(gpu, "Failed creating custom texture!"); + return false; + } + + return true; +} + +static bool parse_buf(pl_gpu gpu, void *alloc, pl_str *body, + struct pl_shader_desc *out) +{ + *out = (struct pl_shader_desc) { + .desc = { + .name = "USER_BUF", + .type = PL_DESC_BUF_UNIFORM, + }, + }; + + // Temporary, to allow deferring variable placement until all headers + // have been processed (in order to e.g. 
determine buffer type) + void *tmp = pl_tmp(alloc); // will be freed automatically on failure + PL_ARRAY(struct pl_var) vars = {0}; + + while (true) { + pl_str rest; + pl_str line = pl_str_strip(pl_str_getline(*body, &rest)); + + if (!pl_str_eatstart0(&line, "//!")) + break; + + *body = rest; + + if (pl_str_eatstart0(&line, "BUFFER")) { + out->desc.name = pl_strdup0(alloc, pl_str_strip(line)); + continue; + } + + if (pl_str_eatstart0(&line, "STORAGE")) { + out->desc.type = PL_DESC_BUF_STORAGE; + out->desc.access = PL_DESC_ACCESS_READWRITE; + out->memory = PL_MEMORY_COHERENT; + continue; + } + + if (pl_str_eatstart0(&line, "VAR")) { + pl_str type_name = pl_str_split_char(pl_str_strip(line), ' ', &line); + struct pl_var var = {0}; + for (const struct pl_named_var *nv = pl_var_glsl_types; nv->glsl_name; nv++) { + if (pl_str_equals0(type_name, nv->glsl_name)) { + var = nv->var; + break; + } + } + + if (!var.type) { + // No type found + PL_ERR(gpu, "Unrecognized GLSL type '%.*s'!", PL_STR_FMT(type_name)); + return false; + } + + pl_str var_name = pl_str_split_char(line, '[', &line); + if (line.len > 0) { + // Parse array dimension + if (!pl_str_parse_int(pl_str_split_char(line, ']', NULL), &var.dim_a)) { + PL_ERR(gpu, "Failed parsing array dimension from [%.*s!", + PL_STR_FMT(line)); + return false; + } + + if (var.dim_a < 1) { + PL_ERR(gpu, "Invalid array dimension %d!", var.dim_a); + return false; + } + } + + var.name = pl_strdup0(alloc, pl_str_strip(var_name)); + PL_ARRAY_APPEND(tmp, vars, var); + continue; + } + + PL_ERR(gpu, "Unrecognized command '%.*s'!", PL_STR_FMT(line)); + return false; + } + + // Try placing all of the buffer variables + for (int i = 0; i < vars.num; i++) { + if (!sh_buf_desc_append(alloc, gpu, out, NULL, vars.elem[i])) { + PL_ERR(gpu, "Custom buffer exceeds GPU limitations!"); + return false; + } + } + + // Decode the rest of the section (up to the next //! 
marker) as raw hex + // data for the buffer + pl_str data, hexdata = split_magic(body); + if (!pl_str_decode_hex(tmp, pl_str_strip(hexdata), &data)) { + PL_ERR(gpu, "Error while parsing BUFFER body: must be a valid " + "hexadecimal sequence!"); + return false; + } + + size_t buf_size = sh_buf_desc_size(out); + if (data.len == 0 && out->desc.type == PL_DESC_BUF_STORAGE) { + // In this case, it's okay that the buffer has no initial data + } else if (data.len != buf_size) { + PL_ERR(gpu, "Shader BUFFER size mismatch: got %zu bytes, expected %zu!", + data.len, buf_size); + return false; + } + + out->binding.object = pl_buf_create(gpu, pl_buf_params( + .size = buf_size, + .uniform = out->desc.type == PL_DESC_BUF_UNIFORM, + .storable = out->desc.type == PL_DESC_BUF_STORAGE, + .initial_data = data.len ? data.buf : NULL, + )); + + if (!out->binding.object) { + PL_ERR(gpu, "Failed creating custom buffer!"); + return false; + } + + pl_free(tmp); + return true; +} + +static bool parse_var(pl_log log, pl_str str, enum pl_var_type type, pl_var_data *out) +{ + if (!str.len) + return true; + + pl_str buf = str; + bool ok = false; + switch (type) { + case PL_VAR_SINT: + ok = pl_str_parse_int(pl_str_split_char(buf, ' ', &buf), &out->i); + break; + case PL_VAR_UINT: + ok = pl_str_parse_uint(pl_str_split_char(buf, ' ', &buf), &out->u); + break; + case PL_VAR_FLOAT: + ok = pl_str_parse_float(pl_str_split_char(buf, ' ', &buf), &out->f); + break; + case PL_VAR_INVALID: + case PL_VAR_TYPE_COUNT: + pl_unreachable(); + } + + if (pl_str_strip(buf).len > 0) + ok = false; // left-over garbage + + if (!ok) { + pl_err(log, "Failed parsing variable data: %.*s", PL_STR_FMT(str)); + return false; + } + + return true; +} + +static bool check_bounds(pl_log log, enum pl_var_type type, const pl_var_data data, + const pl_var_data minimum, const pl_var_data maximum) +{ +#define CHECK_BOUNDS(v, fmt) do \ +{ \ + if (data.v < minimum.v) { \ + pl_err(log, "Initial value "fmt" below declared minimum "fmt"!", 
\ + data.v, minimum.v); \ + return false; \ + } \ + if (data.v > maximum.v) { \ + pl_err(log, "Initial value "fmt" above declared maximum "fmt"!", \ + data.v, maximum.v); \ + return false; \ + } \ +} while (0) + + switch (type) { + case PL_VAR_SINT: + CHECK_BOUNDS(i, "%d"); + break; + case PL_VAR_UINT: + CHECK_BOUNDS(u, "%u"); + break; + case PL_VAR_FLOAT: + CHECK_BOUNDS(f, "%f"); + break; + case PL_VAR_INVALID: + case PL_VAR_TYPE_COUNT: + pl_unreachable(); + } + +#undef CHECK_BOUNDS + return true; +} + +static bool parse_param(pl_log log, void *alloc, pl_str *body, + struct pl_hook_par *out) +{ + *out = (struct pl_hook_par) {0}; + pl_str minimum = {0}; + pl_str maximum = {0}; + bool is_enum = false; + + while (true) { + pl_str rest; + pl_str line = pl_str_strip(pl_str_getline(*body, &rest)); + + if (!pl_str_eatstart0(&line, "//!")) + break; + + *body = rest; + + if (pl_str_eatstart0(&line, "PARAM")) { + out->name = pl_strdup0(alloc, pl_str_strip(line)); + continue; + } + + if (pl_str_eatstart0(&line, "DESC")) { + out->description = pl_strdup0(alloc, pl_str_strip(line)); + continue; + } + + if (pl_str_eatstart0(&line, "MINIMUM")) { + minimum = pl_str_strip(line); + continue; + } + + if (pl_str_eatstart0(&line, "MAXIMUM")) { + maximum = pl_str_strip(line); + continue; + } + + if (pl_str_eatstart0(&line, "TYPE")) { + line = pl_str_strip(line); + is_enum = pl_str_eatstart0(&line, "ENUM"); + line = pl_str_strip(line); + if (pl_str_eatstart0(&line, "DYNAMIC")) { + out->mode = PL_HOOK_PAR_DYNAMIC; + } else if (pl_str_eatstart0(&line, "CONSTANT")) { + out->mode = PL_HOOK_PAR_CONSTANT; + } else if (pl_str_eatstart0(&line, "DEFINE")) { + out->mode = PL_HOOK_PAR_DEFINE; + out->type = PL_VAR_SINT; + if (pl_str_strip(line).len > 0) { + pl_err(log, "TYPE DEFINE does not take any extra arguments, " + "unexpected: '%.*s'", PL_STR_FMT(line)); + return false; + } + continue; + } else { + out->mode = PL_HOOK_PAR_VARIABLE; + } + + line = pl_str_strip(line); + for (const struct 
pl_named_var *nv = pl_var_glsl_types; + nv->glsl_name; nv++) + { + if (pl_str_equals0(line, nv->glsl_name)) { + if (nv->var.dim_v > 1 || nv->var.dim_m > 1) { + pl_err(log, "GLSL type '%s' is incompatible with " + "shader parameters, must be scalar type!", + nv->glsl_name); + return false; + } + + out->type = nv->var.type; + if (is_enum && out->type != PL_VAR_SINT) { + pl_err(log, "ENUM is only compatible with type int/DEFINE!"); + return false; + } + goto next; + } + } + + pl_err(log, "Unrecognized GLSL type '%.*s'!", PL_STR_FMT(line)); + return false; + } + + pl_err(log, "Unrecognized command '%.*s'!", PL_STR_FMT(line)); + return false; + +next: ; + } + + switch (out->type) { + case PL_VAR_INVALID: + pl_err(log, "Missing variable type!"); + return false; + case PL_VAR_SINT: + out->minimum.i = INT_MIN; + out->maximum.i = INT_MAX; + break; + case PL_VAR_UINT: + out->minimum.u = 0; + out->maximum.u = UINT_MAX; + break; + case PL_VAR_FLOAT: + out->minimum.f = -INFINITY; + out->maximum.f = INFINITY; + break; + case PL_VAR_TYPE_COUNT: + pl_unreachable(); + } + + pl_str initial = pl_str_strip(split_magic(body)); + if (!initial.len) { + pl_err(log, "Missing initial parameter value!"); + return false; + } + + if (is_enum) { + PL_ARRAY(const char *) names = {0}; + pl_assert(out->type == PL_VAR_SINT); + do { + pl_str line = pl_str_strip(pl_str_getline(initial, &initial)); + if (!line.len) + continue; + PL_ARRAY_APPEND(alloc, names, pl_strdup0(alloc, line)); + } while (initial.len); + + pl_assert(names.num >= 1); + out->initial.i = 0; + out->minimum.i = 0; + out->maximum.i = names.num - 1; + out->names = names.elem; + } else { + if (!parse_var(log, initial, out->type, &out->initial)) + return false; + if (!parse_var(log, minimum, out->type, &out->minimum)) + return false; + if (!parse_var(log, maximum, out->type, &out->maximum)) + return false; + if (!check_bounds(log, out->type, out->initial, out->minimum, out->maximum)) + return false; + } + + out->data = pl_memdup(alloc, 
&out->initial, sizeof(out->initial)); + return true; +} + +static enum pl_hook_stage mp_stage_to_pl(pl_str stage) +{ + if (pl_str_equals0(stage, "RGB")) + return PL_HOOK_RGB_INPUT; + if (pl_str_equals0(stage, "LUMA")) + return PL_HOOK_LUMA_INPUT; + if (pl_str_equals0(stage, "CHROMA")) + return PL_HOOK_CHROMA_INPUT; + if (pl_str_equals0(stage, "ALPHA")) + return PL_HOOK_ALPHA_INPUT; + if (pl_str_equals0(stage, "XYZ")) + return PL_HOOK_XYZ_INPUT; + + if (pl_str_equals0(stage, "CHROMA_SCALED")) + return PL_HOOK_CHROMA_SCALED; + if (pl_str_equals0(stage, "ALPHA_SCALED")) + return PL_HOOK_ALPHA_SCALED; + + if (pl_str_equals0(stage, "NATIVE")) + return PL_HOOK_NATIVE; + if (pl_str_equals0(stage, "MAINPRESUB")) + return PL_HOOK_RGB; + if (pl_str_equals0(stage, "MAIN")) + return PL_HOOK_RGB; // Note: conflicts with above! + + if (pl_str_equals0(stage, "LINEAR")) + return PL_HOOK_LINEAR; + if (pl_str_equals0(stage, "SIGMOID")) + return PL_HOOK_SIGMOID; + if (pl_str_equals0(stage, "PREKERNEL")) + return PL_HOOK_PRE_KERNEL; + if (pl_str_equals0(stage, "POSTKERNEL")) + return PL_HOOK_POST_KERNEL; + + if (pl_str_equals0(stage, "SCALED")) + return PL_HOOK_SCALED; + if (pl_str_equals0(stage, "PREOUTPUT")) + return PL_HOOK_PRE_OUTPUT; + if (pl_str_equals0(stage, "OUTPUT")) + return PL_HOOK_OUTPUT; + + return 0; +} + +static pl_str pl_stage_to_mp(enum pl_hook_stage stage) +{ + switch (stage) { + case PL_HOOK_RGB_INPUT: return pl_str0("RGB"); + case PL_HOOK_LUMA_INPUT: return pl_str0("LUMA"); + case PL_HOOK_CHROMA_INPUT: return pl_str0("CHROMA"); + case PL_HOOK_ALPHA_INPUT: return pl_str0("ALPHA"); + case PL_HOOK_XYZ_INPUT: return pl_str0("XYZ"); + + case PL_HOOK_CHROMA_SCALED: return pl_str0("CHROMA_SCALED"); + case PL_HOOK_ALPHA_SCALED: return pl_str0("ALPHA_SCALED"); + + case PL_HOOK_NATIVE: return pl_str0("NATIVE"); + case PL_HOOK_RGB: return pl_str0("MAINPRESUB"); + + case PL_HOOK_LINEAR: return pl_str0("LINEAR"); + case PL_HOOK_SIGMOID: return pl_str0("SIGMOID"); + case 
PL_HOOK_PRE_KERNEL: return pl_str0("PREKERNEL"); + case PL_HOOK_POST_KERNEL: return pl_str0("POSTKERNEL"); + + case PL_HOOK_SCALED: return pl_str0("SCALED"); + case PL_HOOK_PRE_OUTPUT: return pl_str0("PREOUTPUT"); + case PL_HOOK_OUTPUT: return pl_str0("OUTPUT"); + }; + + pl_unreachable(); +} + +struct hook_pass { + enum pl_hook_stage exec_stages; + struct custom_shader_hook hook; +}; + +struct pass_tex { + pl_str name; + pl_tex tex; + + // Metadata + pl_rect2df rect; + struct pl_color_repr repr; + struct pl_color_space color; + int comps; +}; + +struct hook_priv { + pl_log log; + pl_gpu gpu; + void *alloc; + + PL_ARRAY(struct hook_pass) hook_passes; + PL_ARRAY(struct pl_hook_par) hook_params; + + // Fixed (for shader-local resources) + PL_ARRAY(struct pl_shader_desc) descriptors; + + // Dynamic per pass + enum pl_hook_stage save_stages; + PL_ARRAY(struct pass_tex) pass_textures; + pl_shader trc_helper; + + // State for PRNG/frame count + int frame_count; + uint64_t prng_state[4]; +}; + +static void hook_reset(void *priv) +{ + struct hook_priv *p = priv; + p->pass_textures.num = 0; +} + +// Context during execution of a hook +struct hook_ctx { + struct hook_priv *priv; + const struct pl_hook_params *params; + struct pass_tex hooked; +}; + +static bool lookup_tex(struct hook_ctx *ctx, pl_str var, float size[2]) +{ + struct hook_priv *p = ctx->priv; + const struct pl_hook_params *params = ctx->params; + + if (pl_str_equals0(var, "HOOKED")) { + pl_assert(ctx->hooked.tex); + size[0] = ctx->hooked.tex->params.w; + size[1] = ctx->hooked.tex->params.h; + return true; + } + + if (pl_str_equals0(var, "NATIVE_CROPPED")) { + size[0] = fabs(pl_rect_w(params->src_rect)); + size[1] = fabs(pl_rect_h(params->src_rect)); + return true; + } + + if (pl_str_equals0(var, "OUTPUT")) { + size[0] = abs(pl_rect_w(params->dst_rect)); + size[1] = abs(pl_rect_h(params->dst_rect)); + return true; + } + + if (pl_str_equals0(var, "MAIN")) + var = pl_str0("MAINPRESUB"); + + for (int i = 0; i < 
p->pass_textures.num; i++) { + if (pl_str_equals(var, p->pass_textures.elem[i].name)) { + pl_tex tex = p->pass_textures.elem[i].tex; + size[0] = tex->params.w; + size[1] = tex->params.h; + return true; + } + } + + return false; +} + +static bool lookup_var(struct hook_ctx *ctx, pl_str var, float *val) +{ + struct hook_priv *p = ctx->priv; + for (int i = 0; i < p->hook_params.num; i++) { + const struct pl_hook_par *hp = &p->hook_params.elem[i]; + if (pl_str_equals0(var, hp->name)) { + switch (hp->type) { + case PL_VAR_SINT: *val = hp->data->i; return true; + case PL_VAR_UINT: *val = hp->data->u; return true; + case PL_VAR_FLOAT: *val = hp->data->f; return true; + case PL_VAR_INVALID: + case PL_VAR_TYPE_COUNT: + break; + } + + pl_unreachable(); + } + + if (hp->names) { + for (int j = hp->minimum.i; j <= hp->maximum.i; j++) { + if (pl_str_equals0(var, hp->names[j])) { + *val = j; + return true; + } + } + } + } + + PL_WARN(p, "Variable '%.*s' not found in RPN expression!", PL_STR_FMT(var)); + return false; +} + +// Returns whether successful. 
'result' is left untouched on failure +static bool eval_shexpr(struct hook_ctx *ctx, + const struct shexp expr[MAX_SHEXP_SIZE], + float *result) +{ + struct hook_priv *p = ctx->priv; + float stack[MAX_SHEXP_SIZE] = {0}; + int idx = 0; // points to next element to push + + for (int i = 0; i < MAX_SHEXP_SIZE; i++) { + switch (expr[i].tag) { + case SHEXP_END: + goto done; + + case SHEXP_CONST: + // Since our SHEXPs are bound by MAX_SHEXP_SIZE, it should be + // impossible to overflow the stack + assert(idx < MAX_SHEXP_SIZE); + stack[idx++] = expr[i].val.cval; + continue; + + case SHEXP_OP1: + if (idx < 1) { + PL_WARN(p, "Stack underflow in RPN expression!"); + return false; + } + + switch (expr[i].val.op) { + case SHEXP_OP_NOT: stack[idx-1] = !stack[idx-1]; break; + default: pl_unreachable(); + } + continue; + + case SHEXP_OP2: + if (idx < 2) { + PL_WARN(p, "Stack underflow in RPN expression!"); + return false; + } + + // Pop the operands in reverse order + float op2 = stack[--idx]; + float op1 = stack[--idx]; + float res = 0.0; + switch (expr[i].val.op) { + case SHEXP_OP_ADD: res = op1 + op2; break; + case SHEXP_OP_SUB: res = op1 - op2; break; + case SHEXP_OP_MUL: res = op1 * op2; break; + case SHEXP_OP_DIV: res = op1 / op2; break; + case SHEXP_OP_MOD: res = fmodf(op1, op2); break; + case SHEXP_OP_GT: res = op1 > op2; break; + case SHEXP_OP_LT: res = op1 < op2; break; + case SHEXP_OP_EQ: res = fabsf(op1 - op2) <= 1e-6 * fmaxf(op1, op2); break; + case SHEXP_OP_NOT: pl_unreachable(); + } + + if (!isfinite(res)) { + PL_WARN(p, "Illegal operation in RPN expression!"); + return false; + } + + stack[idx++] = res; + continue; + + case SHEXP_TEX_W: + case SHEXP_TEX_H: { + pl_str name = expr[i].val.varname; + float size[2]; + + if (!lookup_tex(ctx, name, size)) { + PL_WARN(p, "Variable '%.*s' not found in RPN expression!", + PL_STR_FMT(name)); + return false; + } + + stack[idx++] = (expr[i].tag == SHEXP_TEX_W) ? 
size[0] : size[1]; + continue; + } + + case SHEXP_VAR: { + pl_str name = expr[i].val.varname; + float val; + if (!lookup_var(ctx, name, &val)) + return false; + stack[idx++] = val; + continue; + } + } + } + +done: + // Return the single stack element + if (idx != 1) { + PL_WARN(p, "Malformed stack after RPN expression!"); + return false; + } + + *result = stack[0]; + return true; +} + +static double prng_step(uint64_t s[4]) +{ + const uint64_t result = s[0] + s[3]; + const uint64_t t = s[1] << 17; + + s[2] ^= s[0]; + s[3] ^= s[1]; + s[1] ^= s[2]; + s[0] ^= s[3]; + + s[2] ^= t; + s[3] = (s[3] << 45) | (s[3] >> (64 - 45)); + return (result >> 11) * 0x1.0p-53; +} + +static bool bind_pass_tex(pl_shader sh, pl_str name, + const struct pass_tex *ptex, + const pl_rect2df *rect, + bool hooked, bool mainpresub) +{ + ident_t id, pos, pt; + + // Compatibility with mpv texture binding semantics + id = sh_bind(sh, ptex->tex, PL_TEX_ADDRESS_CLAMP, PL_TEX_SAMPLE_LINEAR, + "hook_tex", rect, &pos, &pt); + if (!id) + return false; + + GLSLH("#define %.*s_raw "$" \n", PL_STR_FMT(name), id); + GLSLH("#define %.*s_pos "$" \n", PL_STR_FMT(name), pos); + GLSLH("#define %.*s_map "$"_map \n", PL_STR_FMT(name), pos); + GLSLH("#define %.*s_size vec2(textureSize("$", 0)) \n", PL_STR_FMT(name), id); + GLSLH("#define %.*s_pt "$" \n", PL_STR_FMT(name), pt); + + float off[2] = { ptex->rect.x0, ptex->rect.y0 }; + GLSLH("#define %.*s_off "$" \n", PL_STR_FMT(name), + sh_var(sh, (struct pl_shader_var) { + .var = pl_var_vec2("offset"), + .data = off, + })); + + struct pl_color_repr repr = ptex->repr; + ident_t scale = SH_FLOAT(pl_color_repr_normalize(&repr)); + GLSLH("#define %.*s_mul "$" \n", PL_STR_FMT(name), scale); + + // Compatibility with mpv + GLSLH("#define %.*s_rot mat2(1.0, 0.0, 0.0, 1.0) \n", PL_STR_FMT(name)); + + // Sampling function boilerplate + GLSLH("#define %.*s_tex(pos) ("$" * vec4(textureLod("$", pos, 0.0))) \n", + PL_STR_FMT(name), scale, id); + GLSLH("#define %.*s_texOff(off) 
(%.*s_tex("$" + "$" * vec2(off))) \n", + PL_STR_FMT(name), PL_STR_FMT(name), pos, pt); + + bool can_gather = ptex->tex->params.format->gatherable; + if (can_gather) { + GLSLH("#define %.*s_gather(pos, c) ("$" * vec4(textureGather("$", pos, c))) \n", + PL_STR_FMT(name), scale, id); + } + + if (hooked) { + GLSLH("#define HOOKED_raw %.*s_raw \n", PL_STR_FMT(name)); + GLSLH("#define HOOKED_pos %.*s_pos \n", PL_STR_FMT(name)); + GLSLH("#define HOOKED_size %.*s_size \n", PL_STR_FMT(name)); + GLSLH("#define HOOKED_rot %.*s_rot \n", PL_STR_FMT(name)); + GLSLH("#define HOOKED_off %.*s_off \n", PL_STR_FMT(name)); + GLSLH("#define HOOKED_pt %.*s_pt \n", PL_STR_FMT(name)); + GLSLH("#define HOOKED_map %.*s_map \n", PL_STR_FMT(name)); + GLSLH("#define HOOKED_mul %.*s_mul \n", PL_STR_FMT(name)); + GLSLH("#define HOOKED_tex %.*s_tex \n", PL_STR_FMT(name)); + GLSLH("#define HOOKED_texOff %.*s_texOff \n", PL_STR_FMT(name)); + if (can_gather) + GLSLH("#define HOOKED_gather %.*s_gather \n", PL_STR_FMT(name)); + } + + if (mainpresub) { + GLSLH("#define MAIN_raw MAINPRESUB_raw \n"); + GLSLH("#define MAIN_pos MAINPRESUB_pos \n"); + GLSLH("#define MAIN_size MAINPRESUB_size \n"); + GLSLH("#define MAIN_rot MAINPRESUB_rot \n"); + GLSLH("#define MAIN_off MAINPRESUB_off \n"); + GLSLH("#define MAIN_pt MAINPRESUB_pt \n"); + GLSLH("#define MAIN_map MAINPRESUB_map \n"); + GLSLH("#define MAIN_mul MAINPRESUB_mul \n"); + GLSLH("#define MAIN_tex MAINPRESUB_tex \n"); + GLSLH("#define MAIN_texOff MAINPRESUB_texOff \n"); + if (can_gather) + GLSLH("#define MAIN_gather MAINPRESUB_gather \n"); + } + + return true; +} + +static void save_pass_tex(struct hook_priv *p, struct pass_tex ptex) +{ + + for (int i = 0; i < p->pass_textures.num; i++) { + if (!pl_str_equals(p->pass_textures.elem[i].name, ptex.name)) + continue; + + p->pass_textures.elem[i] = ptex; + return; + } + + // No texture with this name yet, append new one + PL_ARRAY_APPEND(p->alloc, p->pass_textures, ptex); +} + +static struct pl_hook_res 
hook_hook(void *priv, const struct pl_hook_params *params) +{ + struct hook_priv *p = priv; + pl_str stage = pl_stage_to_mp(params->stage); + struct pl_hook_res res = {0}; + + pl_shader sh = NULL; + struct hook_ctx ctx = { + .priv = p, + .params = params, + .hooked = { + .name = stage, + .tex = params->tex, + .rect = params->rect, + .repr = params->repr, + .color = params->color, + .comps = params->components, + }, + }; + + // Save the input texture if needed + if (p->save_stages & params->stage) { + PL_TRACE(p, "Saving input texture '%.*s' for binding", + PL_STR_FMT(ctx.hooked.name)); + save_pass_tex(p, ctx.hooked); + } + + for (int n = 0; n < p->hook_passes.num; n++) { + const struct hook_pass *pass = &p->hook_passes.elem[n]; + if (!(pass->exec_stages & params->stage)) + continue; + + const struct custom_shader_hook *hook = &pass->hook; + PL_TRACE(p, "Executing hook pass %d on stage '%.*s': %.*s", + n, PL_STR_FMT(stage), PL_STR_FMT(hook->pass_desc)); + + // Test for execution condition + float run = 0; + if (!eval_shexpr(&ctx, hook->cond, &run)) + goto error; + + if (!run) { + PL_TRACE(p, "Skipping hook due to condition"); + continue; + } + + // Generate a new shader object + sh = pl_dispatch_begin(params->dispatch); + + // Bind all necessary input textures + for (int i = 0; i < PL_ARRAY_SIZE(hook->bind_tex); i++) { + pl_str texname = hook->bind_tex[i]; + if (!texname.len) + break; + + // Convenience alias, to allow writing shaders that are oblivious + // of the exact stage they hooked. This simply translates to + // whatever stage actually fired the hook. + bool hooked = false, mainpresub = false; + if (pl_str_equals0(texname, "HOOKED")) { + // Continue with binding this, under the new name + texname = stage; + hooked = true; + } + + // Compatibility alias, because MAIN and MAINPRESUB mean the same + // thing to libplacebo, but user shaders are still written as + // though they can be different concepts. 
+ if (pl_str_equals0(texname, "MAIN") || + pl_str_equals0(texname, "MAINPRESUB")) + { + texname = pl_str0("MAINPRESUB"); + mainpresub = true; + } + + for (int j = 0; j < p->descriptors.num; j++) { + if (pl_str_equals0(texname, p->descriptors.elem[j].desc.name)) { + // Directly bind this, no need to bother with all the + // `bind_pass_tex` boilerplate + ident_t id = sh_desc(sh, p->descriptors.elem[j]); + GLSLH("#define %.*s "$" \n", PL_STR_FMT(texname), id); + + if (p->descriptors.elem[j].desc.type == PL_DESC_SAMPLED_TEX) { + GLSLH("#define %.*s_tex(pos) (textureLod("$", pos, 0.0)) \n", + PL_STR_FMT(texname), id); + } + goto next_bind; + } + } + + for (int j = 0; j < p->pass_textures.num; j++) { + if (pl_str_equals(texname, p->pass_textures.elem[j].name)) { + // Note: We bind the whole texture, rather than + // hooked.rect, because user shaders in general are not + // designed to handle cropped input textures. + const struct pass_tex *ptex = &p->pass_textures.elem[j]; + pl_rect2df rect = { + 0, 0, ptex->tex->params.w, ptex->tex->params.h, + }; + + if (hook->offset_align && pl_str_equals(texname, stage)) { + float sx = pl_rect_w(ctx.hooked.rect) / pl_rect_w(params->src_rect), + sy = pl_rect_h(ctx.hooked.rect) / pl_rect_h(params->src_rect), + ox = ctx.hooked.rect.x0 - sx * params->src_rect.x0, + oy = ctx.hooked.rect.y0 - sy * params->src_rect.y0; + + PL_TRACE(p, "Aligning plane with ref: %f %f", ox, oy); + pl_rect2df_offset(&rect, ox, oy); + } + + if (!bind_pass_tex(sh, texname, &p->pass_textures.elem[j], + &rect, hooked, mainpresub)) + { + goto error; + } + goto next_bind; + } + } + + // If none of the above matched, this is an unknown texture name, + // so silently ignore this pass to match the mpv behavior + PL_TRACE(p, "Skipping hook due to no texture named '%.*s'.", + PL_STR_FMT(texname)); + pl_dispatch_abort(params->dispatch, &sh); + goto next_pass; + + next_bind: ; // outer 'continue' + } + + // Set up the input variables + p->frame_count++; + GLSLH("#define 
frame "$" \n", sh_var(sh, (struct pl_shader_var) { + .var = pl_var_int("frame"), + .data = &p->frame_count, + .dynamic = true, + })); + + float random = prng_step(p->prng_state); + GLSLH("#define random "$" \n", sh_var(sh, (struct pl_shader_var) { + .var = pl_var_float("random"), + .data = &random, + .dynamic = true, + })); + + float src_size[2] = { pl_rect_w(params->src_rect), pl_rect_h(params->src_rect) }; + GLSLH("#define input_size "$" \n", sh_var(sh, (struct pl_shader_var) { + .var = pl_var_vec2("input_size"), + .data = src_size, + })); + + float dst_size[2] = { pl_rect_w(params->dst_rect), pl_rect_h(params->dst_rect) }; + GLSLH("#define target_size "$" \n", sh_var(sh, (struct pl_shader_var) { + .var = pl_var_vec2("target_size"), + .data = dst_size, + })); + + float tex_off[2] = { params->src_rect.x0, params->src_rect.y0 }; + GLSLH("#define tex_offset "$" \n", sh_var(sh, (struct pl_shader_var) { + .var = pl_var_vec2("tex_offset"), + .data = tex_off, + })); + + // Custom parameters + for (int i = 0; i < p->hook_params.num; i++) { + const struct pl_hook_par *hp = &p->hook_params.elem[i]; + switch (hp->mode) { + case PL_HOOK_PAR_VARIABLE: + case PL_HOOK_PAR_DYNAMIC: + GLSLH("#define %s "$" \n", hp->name, + sh_var(sh, (struct pl_shader_var) { + .var = { + .name = hp->name, + .type = hp->type, + .dim_v = 1, + .dim_m = 1, + .dim_a = 1, + }, + .data = hp->data, + .dynamic = hp->mode == PL_HOOK_PAR_DYNAMIC, + })); + break; + + case PL_HOOK_PAR_CONSTANT: + GLSLH("#define %s "$" \n", hp->name, + sh_const(sh, (struct pl_shader_const) { + .name = hp->name, + .type = hp->type, + .data = hp->data, + .compile_time = true, + })); + break; + + case PL_HOOK_PAR_DEFINE: + GLSLH("#define %s %d \n", hp->name, hp->data->i); + break; + + case PL_HOOK_PAR_MODE_COUNT: + pl_unreachable(); + } + + if (hp->names) { + for (int j = hp->minimum.i; j <= hp->maximum.i; j++) + GLSLH("#define %s %d \n", hp->names[j], j); + } + } + + // Helper sub-shaders + uint64_t sh_id = SH_PARAMS(sh).id; + 
pl_shader_reset(p->trc_helper, pl_shader_params( + .id = ++sh_id, + .gpu = p->gpu, + )); + pl_shader_linearize(p->trc_helper, params->orig_color); + GLSLH("#define linearize "$" \n", sh_subpass(sh, p->trc_helper)); + + pl_shader_reset(p->trc_helper, pl_shader_params( + .id = ++sh_id, + .gpu = p->gpu, + )); + pl_shader_delinearize(p->trc_helper, params->orig_color); + GLSLH("#define delinearize "$" \n", sh_subpass(sh, p->trc_helper)); + + // Load and run the user shader itself + sh_append_str(sh, SH_BUF_HEADER, hook->pass_body); + sh_describef(sh, "%.*s", PL_STR_FMT(hook->pass_desc)); + + // Resolve output size and create framebuffer + float out_size[2] = {0}; + if (!eval_shexpr(&ctx, hook->width, &out_size[0]) || + !eval_shexpr(&ctx, hook->height, &out_size[1])) + { + goto error; + } + + int out_w = roundf(out_size[0]), + out_h = roundf(out_size[1]); + + if (!sh_require(sh, PL_SHADER_SIG_NONE, out_w, out_h)) + goto error; + + // Generate a new texture to store the render result + pl_tex fbo; + fbo = params->get_tex(params->priv, out_w, out_h); + if (!fbo) { + PL_ERR(p, "Failed dispatching hook: `get_tex` callback failed?"); + goto error; + } + + bool ok; + if (hook->is_compute) { + + if (!sh_try_compute(sh, hook->threads_w, hook->threads_h, false, 0) || + !fbo->params.storable) + { + PL_ERR(p, "Failed dispatching COMPUTE shader"); + goto error; + } + + GLSLP("#define out_image "$" \n", sh_desc(sh, (struct pl_shader_desc) { + .binding.object = fbo, + .desc = { + .name = "out_image", + .type = PL_DESC_STORAGE_IMG, + .access = PL_DESC_ACCESS_WRITEONLY, + }, + })); + + sh->output = PL_SHADER_SIG_NONE; + + GLSL("hook(); \n"); + ok = pl_dispatch_compute(params->dispatch, pl_dispatch_compute_params( + .shader = &sh, + .dispatch_size = { + // Round up as many blocks as are needed to cover the image + PL_DIV_UP(out_w, hook->block_w), + PL_DIV_UP(out_h, hook->block_h), + 1, + }, + .width = out_w, + .height = out_h, + )); + + } else { + + // Default non-COMPUTE shaders to 
explicitly use fragment shaders + // only, to avoid breaking things like fwidth() + sh->type = PL_DEF(sh->type, SH_FRAGMENT); + + GLSL("vec4 color = hook(); \n"); + ok = pl_dispatch_finish(params->dispatch, pl_dispatch_params( + .shader = &sh, + .target = fbo, + )); + + } + + if (!ok) + goto error; + + float sx = (float) out_w / ctx.hooked.tex->params.w, + sy = (float) out_h / ctx.hooked.tex->params.h, + x0 = sx * ctx.hooked.rect.x0 + hook->offset[0], + y0 = sy * ctx.hooked.rect.y0 + hook->offset[1]; + + pl_rect2df new_rect = { + x0, + y0, + x0 + sx * pl_rect_w(ctx.hooked.rect), + y0 + sy * pl_rect_h(ctx.hooked.rect), + }; + + if (hook->offset_align) { + float rx = pl_rect_w(new_rect) / pl_rect_w(params->src_rect), + ry = pl_rect_h(new_rect) / pl_rect_h(params->src_rect), + ox = rx * params->src_rect.x0 - sx * ctx.hooked.rect.x0, + oy = ry * params->src_rect.y0 - sy * ctx.hooked.rect.y0; + + pl_rect2df_offset(&new_rect, ox, oy); + } + + // Save the result of this shader invocation + struct pass_tex ptex = { + .name = hook->save_tex.len ? 
hook->save_tex : stage, + .tex = fbo, + .repr = ctx.hooked.repr, + .color = ctx.hooked.color, + .comps = PL_DEF(hook->comps, ctx.hooked.comps), + .rect = new_rect, + }; + + // It's assumed that users will correctly normalize the input + pl_color_repr_normalize(&ptex.repr); + + PL_TRACE(p, "Saving output texture '%.*s' from hook execution on '%.*s'", + PL_STR_FMT(ptex.name), PL_STR_FMT(stage)); + + save_pass_tex(p, ptex); + + // Update the result object, unless we saved to a different name + if (pl_str_equals(ptex.name, stage)) { + ctx.hooked = ptex; + res = (struct pl_hook_res) { + .output = PL_HOOK_SIG_TEX, + .tex = fbo, + .repr = ptex.repr, + .color = ptex.color, + .components = ptex.comps, + .rect = new_rect, + }; + } + +next_pass: ; + } + + return res; + +error: + pl_dispatch_abort(params->dispatch, &sh); + return (struct pl_hook_res) { .failed = true }; +} + +const struct pl_hook *pl_mpv_user_shader_parse(pl_gpu gpu, + const char *shader_text, + size_t shader_len) +{ + if (!shader_len) + return NULL; + + pl_str shader = { (uint8_t *) shader_text, shader_len }; + + struct pl_hook *hook = pl_zalloc_obj(NULL, hook, struct hook_priv); + struct hook_priv *p = PL_PRIV(hook); + + *hook = (struct pl_hook) { + .input = PL_HOOK_SIG_TEX, + .priv = p, + .reset = hook_reset, + .hook = hook_hook, + .signature = pl_str_hash(shader), + }; + + *p = (struct hook_priv) { + .log = gpu->log, + .gpu = gpu, + .alloc = hook, + .trc_helper = pl_shader_alloc(gpu->log, NULL), + .prng_state = { + // Determined by fair die roll + 0xb76d71f9443c228allu, 0x93a02092fc4807e8llu, + 0x06d81748f838bd07llu, 0x9381ee129dddce6cllu, + }, + }; + + shader = pl_strdup(hook, shader); + + // Skip all garbage (e.g. 
comments) before the first header + int pos = pl_str_find(shader, pl_str0("//!")); + if (pos < 0) { + PL_ERR(gpu, "Shader appears to contain no headers?"); + goto error; + } + shader = pl_str_drop(shader, pos); + + // Loop over the file + while (shader.len > 0) + { + // Peek at the first header to dispatch the right type + if (pl_str_startswith0(shader, "//!TEXTURE")) { + struct pl_shader_desc sd; + if (!parse_tex(gpu, hook, &shader, &sd)) + goto error; + + PL_INFO(gpu, "Registering named texture '%s'", sd.desc.name); + PL_ARRAY_APPEND(hook, p->descriptors, sd); + continue; + } + + if (pl_str_startswith0(shader, "//!BUFFER")) { + struct pl_shader_desc sd; + if (!parse_buf(gpu, hook, &shader, &sd)) + goto error; + + PL_INFO(gpu, "Registering named buffer '%s'", sd.desc.name); + PL_ARRAY_APPEND(hook, p->descriptors, sd); + continue; + } + + if (pl_str_startswith0(shader, "//!PARAM")) { + struct pl_hook_par hp; + if (!parse_param(gpu->log, hook, &shader, &hp)) + goto error; + + PL_INFO(gpu, "Registering named parameter '%s'", hp.name); + PL_ARRAY_APPEND(hook, p->hook_params, hp); + continue; + } + + struct custom_shader_hook h; + if (!parse_hook(gpu->log, &shader, &h)) + goto error; + + struct hook_pass pass = { + .exec_stages = 0, + .hook = h, + }; + + for (int i = 0; i < PL_ARRAY_SIZE(h.hook_tex); i++) + pass.exec_stages |= mp_stage_to_pl(h.hook_tex[i]); + for (int i = 0; i < PL_ARRAY_SIZE(h.bind_tex); i++) { + p->save_stages |= mp_stage_to_pl(h.bind_tex[i]); + if (pl_str_equals0(h.bind_tex[i], "HOOKED")) + p->save_stages |= pass.exec_stages; + } + + // As an extra precaution, this avoids errors when trying to run + // conditions against planes that were never hooked. As a sole + // exception, OUTPUT is special because it's hard-coded to return the + // dst_rect even before it was hooked. 
(This is an apparently + // undocumented mpv quirk, but shaders rely on it in practice) + enum pl_hook_stage rpn_stages = 0; + for (int i = 0; i < PL_ARRAY_SIZE(h.width); i++) { + if (h.width[i].tag == SHEXP_TEX_W || h.width[i].tag == SHEXP_TEX_H) + rpn_stages |= mp_stage_to_pl(h.width[i].val.varname); + } + for (int i = 0; i < PL_ARRAY_SIZE(h.height); i++) { + if (h.height[i].tag == SHEXP_TEX_W || h.height[i].tag == SHEXP_TEX_H) + rpn_stages |= mp_stage_to_pl(h.height[i].val.varname); + } + for (int i = 0; i < PL_ARRAY_SIZE(h.cond); i++) { + if (h.cond[i].tag == SHEXP_TEX_W || h.cond[i].tag == SHEXP_TEX_H) + rpn_stages |= mp_stage_to_pl(h.cond[i].val.varname); + } + + p->save_stages |= rpn_stages & ~PL_HOOK_OUTPUT; + + PL_INFO(gpu, "Registering hook pass: %.*s", PL_STR_FMT(h.pass_desc)); + PL_ARRAY_APPEND(hook, p->hook_passes, pass); + } + + // We need to hook on both the exec and save stages, so that we can keep + // track of any textures we might need + hook->stages |= p->save_stages; + for (int i = 0; i < p->hook_passes.num; i++) + hook->stages |= p->hook_passes.elem[i].exec_stages; + + hook->parameters = p->hook_params.elem; + hook->num_parameters = p->hook_params.num; + + PL_MSG(gpu, PL_LOG_DEBUG, "Loaded user shader:"); + pl_msg_source(gpu->log, PL_LOG_DEBUG, shader_text); + + return hook; + +error: + pl_mpv_user_shader_destroy((const struct pl_hook **) &hook); + PL_MSG(gpu, PL_LOG_ERR, "Failed to parse user shader:"); + pl_msg_source(gpu->log, PL_LOG_ERR, shader_text); + pl_log_stack_trace(gpu->log, PL_LOG_ERR); + return NULL; +} + +void pl_mpv_user_shader_destroy(const struct pl_hook **hookp) +{ + const struct pl_hook *hook = *hookp; + if (!hook) + return; + + struct hook_priv *p = PL_PRIV(hook); + for (int i = 0; i < p->descriptors.num; i++) { + switch (p->descriptors.elem[i].desc.type) { + case PL_DESC_BUF_UNIFORM: + case PL_DESC_BUF_STORAGE: + case PL_DESC_BUF_TEXEL_UNIFORM: + case PL_DESC_BUF_TEXEL_STORAGE: { + pl_buf buf = 
p->descriptors.elem[i].binding.object; + pl_buf_destroy(p->gpu, &buf); + break; + } + + case PL_DESC_SAMPLED_TEX: + case PL_DESC_STORAGE_IMG: { + pl_tex tex = p->descriptors.elem[i].binding.object; + pl_tex_destroy(p->gpu, &tex); + break; + + case PL_DESC_INVALID: + case PL_DESC_TYPE_COUNT: + pl_unreachable(); + } + } + } + + pl_shader_free(&p->trc_helper); + pl_free((void *) hook); + *hookp = NULL; +} diff --git a/src/shaders/deinterlacing.c b/src/shaders/deinterlacing.c new file mode 100644 index 0000000..5c85138 --- /dev/null +++ b/src/shaders/deinterlacing.c @@ -0,0 +1,260 @@ +/* + * This file is part of libplacebo, but also based on vf_yadif_cuda.cu: + * Copyright (C) 2018 Philip Langdale <philipl@overt.org> + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
*/

#include "shaders.h"

#include <libplacebo/shaders/deinterlacing.h>

const struct pl_deinterlace_params pl_deinterlace_default_params = { PL_DEINTERLACE_DEFAULTS };

// Emits GLSL that samples `src->cur.top` and writes the (possibly
// deinterlaced) result into a newly declared `vec4 color`.
//
// sh:     shader to append to; must accept a `PL_SHADER_SIG_NONE` input of the
//         same size as the current field's texture
// src:    current field texture plus optional previous/next textures, the
//         field being rendered and an optional component mask
// params: algorithm selection; NULL selects `pl_deinterlace_default_params`
//
// Lines belonging to the primary field are passed through untouched; only the
// other lines are reconstructed by the selected algorithm. If `src->field` is
// `PL_FIELD_NONE`, the texture is sampled as-is.
void pl_shader_deinterlace(pl_shader sh, const struct pl_deinterlace_source *src,
                           const struct pl_deinterlace_params *params)
{
    params = PL_DEF(params, &pl_deinterlace_default_params);

    const struct pl_tex_params *texparams = &src->cur.top->params;
    if (!sh_require(sh, PL_SHADER_SIG_NONE, texparams->w, texparams->h))
        return;

    sh_describe(sh, "deinterlacing");
    GLSL("vec4 color = vec4(0,0,0,1); \n"
         "// pl_shader_deinterlace \n"
         "{ \n");

    // Default to all four components, then drop those the format lacks
    uint8_t comp_mask = PL_DEF(src->component_mask, 0xFu);
    comp_mask &= (1u << texparams->format->num_components) - 1u;
    if (!comp_mask) {
        SH_FAIL(sh, "pl_shader_deinterlace: empty component mask?");
        return;
    }

    const uint8_t num_comps = sh_num_comps(comp_mask);
    const char *swiz = sh_swizzle(comp_mask);
    GLSL("#define T %s \n", sh_float_type(comp_mask));

    ident_t pos, pt;
    ident_t cur = sh_bind(sh, src->cur.top, PL_TEX_ADDRESS_MIRROR,
                          PL_TEX_SAMPLE_NEAREST, "cur", NULL, &pos, &pt);
    if (!cur)
        return;

    // GET(TEX, X, Y) fetches the masked components of the texel that lies
    // (X, Y) texels away from the current position
    GLSL("#define GET(TEX, X, Y) \\\n"
         " (textureLod(TEX, pos + pt * vec2(X, Y), 0.0).%s) \n"
         "vec2 pos = "$"; \n"
         "vec2 pt = "$"; \n"
         "T res; \n",
         swiz, pos, pt);

    if (src->field == PL_FIELD_NONE) {
        GLSL("res = GET("$", 0, 0); \n", cur);
        goto done;
    }

    // Don't modify the primary field
    GLSL("int yh = textureSize("$", 0).y; \n"
         "int yo = int("$".y * float(yh)); \n"
         "if (yo %% 2 == %d) { \n"
         " res = GET("$", 0, 0); \n"
         "} else { \n",
         cur, pos,
         src->field == PL_FIELD_TOP ? 0 : 1,
         cur);

    switch (params->algo) {
    case PL_DEINTERLACE_WEAVE:
        GLSL("res = GET("$", 0, 0); \n", cur);
        break;

    case PL_DEINTERLACE_BOB:
        // Repeat the vertically adjacent line of the primary field
        GLSL("res = GET("$", 0, %d); \n", cur,
             src->field == PL_FIELD_TOP ? -1 : 1);
        break;


    case PL_DEINTERLACE_YADIF: {
        // Try using a compute shader for this, for the sole reason of
        // optimizing for thread group synchronicity. Otherwise, because we
        // alternate between lines output as-is and lines output deinterlaced,
        // half of our thread group will be mostly idle at any point in time.
        const int bw = PL_DEF(sh_glsl(sh).subgroup_size, 32);
        sh_try_compute(sh, bw, 1, true, 0);

        // This magic constant is hard-coded in the original implementation as
        // '1' on an 8-bit scale. Since we work with arbitrary bit depth
        // floating point textures, we have to convert this somehow. Hard-code
        // it as 1/255 under the assumption that the original intent was to be
        // roughly 1 unit of brightness increment on an 8-bit source. This may
        // or may not produce suboptimal results on higher-bit-depth content.
        static const float spatial_bias = 1 / 255.0f;

        // Calculate spatial prediction
        //
        // The helper receives the seven samples a..g from the line above and
        // the seven samples h..n from the line below (see the GET calls that
        // follow), and returns the average of the pair with the lowest score.
        ident_t spatial_pred = sh_fresh(sh, "spatial_predictor");
        GLSLH("float "$"(float a, float b, float c, float d, float e, float f, float g, \n"
              " float h, float i, float j, float k, float l, float m, float n) \n"
              "{ \n"
              " float spatial_pred = (d + k) / 2.0; \n"
              " float spatial_score = abs(c - j) + abs(d - k) + abs(e - l) - %f; \n"

              " float score = abs(b - k) + abs(c - l) + abs(d - m); \n"
              " if (score < spatial_score) { \n"
              " spatial_pred = (c + l) / 2.0; \n"
              " spatial_score = score; \n"
              " score = abs(a - l) + abs(b - m) + abs(c - n); \n"
              " if (score < spatial_score) { \n"
              " spatial_pred = (b + m) / 2.0; \n"
              " spatial_score = score; \n"
              " } \n"
              " } \n"
              " score = abs(d - i) + abs(e - j) + abs(f - k); \n"
              " if (score < spatial_score) { \n"
              " spatial_pred = (e + j) / 2.0; \n"
              " spatial_score = score; \n"
              " score = abs(e - h) + abs(f - i) + abs(g - j); \n"
              " if (score < spatial_score) { \n"
              " spatial_pred = (f + i) / 2.0; \n"
              " spatial_score = score; \n"
              " } \n"
              " } \n"
              " return spatial_pred; \n"
              "} \n",
              spatial_pred, spatial_bias);

        GLSL("T a = GET("$", -3, -1); \n"
             "T b = GET("$", -2, -1); \n"
             "T c = GET("$", -1, -1); \n"
             "T d = GET("$", 0, -1); \n"
             "T e = GET("$", +1, -1); \n"
             "T f = GET("$", +2, -1); \n"
             "T g = GET("$", +3, -1); \n"
             "T h = GET("$", -3, +1); \n"
             "T i = GET("$", -2, +1); \n"
             "T j = GET("$", -1, +1); \n"
             "T k = GET("$", 0, +1); \n"
             "T l = GET("$", +1, +1); \n"
             "T m = GET("$", +2, +1); \n"
             "T n = GET("$", +3, +1); \n",
             cur, cur, cur, cur, cur, cur, cur, cur, cur, cur, cur, cur, cur, cur);

        // The predictor operates on scalars, so apply it per component
        if (num_comps == 1) {
            GLSL("res = "$"(a, b, c, d, e, f, g, h, i, j, k, l, m, n); \n", spatial_pred);
        } else {
            for (uint8_t i = 0; i < num_comps; i++) {
                char c = "xyzw"[i];
                GLSL("res.%c = "$"(a.%c, b.%c, c.%c, d.%c, e.%c, f.%c, g.%c, \n"
                     " h.%c, i.%c, j.%c, k.%c, l.%c, m.%c, n.%c); \n",
                     c, spatial_pred, c, c, c, c, c, c, c, c, c, c, c, c, c, c);
            }
        }

        // Calculate temporal prediction
        //
        // The helper clamps the spatial prediction to within `diff` of `p2`,
        // where `diff` is derived from differences between the samples A..L
        // fetched below.
        ident_t temporal_pred = sh_fresh(sh, "temporal_predictor");
        GLSLH("float "$"(float A, float B, float C, float D, float E, float F, \n"
              " float G, float H, float I, float J, float K, float L, \n"
              " float spatial_pred) \n"
              "{ \n"
              " float p0 = (C + H) / 2.0; \n"
              " float p1 = F; \n"
              " float p2 = (D + I) / 2.0; \n"
              " float p3 = G; \n"
              " float p4 = (E + J) / 2.0; \n"

              " float tdiff0 = abs(D - I) / 2.0; \n"
              " float tdiff1 = (abs(A - F) + abs(B - G)) / 2.0; \n"
              " float tdiff2 = (abs(K - F) + abs(G - L)) / 2.0; \n"
              " float diff = max(tdiff0, max(tdiff1, tdiff2)); \n",
              temporal_pred);
        if (!params->skip_spatial_check) {
            GLSLH("float maxi = max(p2 - min(p3, p1), min(p0 - p1, p4 - p3)); \n"
                  "float mini = min(p2 - max(p3, p1), max(p0 - p1, p4 - p3)); \n"
                  "diff = max(diff, max(mini, -maxi)); \n");
        }
        GLSLH(" if (spatial_pred > p2 + diff) \n"
              " spatial_pred = p2 + diff; \n"
              " if (spatial_pred < p2 - diff) \n"
              " spatial_pred = p2 - diff; \n"
              " return spatial_pred; \n"
              "} \n");

        // Missing (or identical) neighbouring frames fall back to the
        // current frame
        ident_t prev2 = cur, next2 = cur;
        if (src->prev.top && src->prev.top != src->cur.top) {
            pl_assert(src->prev.top->params.w == texparams->w);
            pl_assert(src->prev.top->params.h == texparams->h);
            prev2 = sh_bind(sh, src->prev.top, PL_TEX_ADDRESS_MIRROR,
                            PL_TEX_SAMPLE_NEAREST, "prev", NULL, NULL, NULL);
            if (!prev2)
                return;
        }

        if (src->next.top && src->next.top != src->cur.top) {
            pl_assert(src->next.top->params.w == texparams->w);
            pl_assert(src->next.top->params.h == texparams->h);
            next2 = sh_bind(sh, src->next.top, PL_TEX_ADDRESS_MIRROR,
                            PL_TEX_SAMPLE_NEAREST, "next", NULL, NULL, NULL);
            if (!next2)
                return;
        }

        // Which textures supply the C..E and H..J samples depends on whether
        // the field being rendered is the first field of its frame
        enum pl_field first_field = PL_DEF(src->first_field, PL_FIELD_TOP);
        ident_t prev1 = src->field == first_field ? prev2 : cur;
        ident_t next1 = src->field == first_field ? cur : next2;

        GLSL("T A = GET("$", 0, -1); \n"
             "T B = GET("$", 0, 1); \n"
             "T C = GET("$", 0, -2); \n"
             "T D = GET("$", 0, 0); \n"
             "T E = GET("$", 0, +2); \n"
             "T F = GET("$", 0, -1); \n"
             "T G = GET("$", 0, +1); \n"
             "T H = GET("$", 0, -2); \n"
             "T I = GET("$", 0, 0); \n"
             "T J = GET("$", 0, +2); \n"
             "T K = GET("$", 0, -1); \n"
             "T L = GET("$", 0, +1); \n",
             prev2, prev2,
             prev1, prev1, prev1,
             cur, cur,
             next1, next1, next1,
             next2, next2);

        // `res` still holds the spatial prediction at this point and is
        // passed in as the value to be clamped
        if (num_comps == 1) {
            GLSL("res = "$"(A, B, C, D, E, F, G, H, I, J, K, L, res); \n", temporal_pred);
        } else {
            for (uint8_t i = 0; i < num_comps; i++) {
                char c = "xyzw"[i];
                GLSL("res.%c = "$"(A.%c, B.%c, C.%c, D.%c, E.%c, F.%c, \n"
                     " G.%c, H.%c, I.%c, J.%c, K.%c, L.%c, \n"
                     " res.%c); \n",
                     c, temporal_pred, c, c, c, c, c, c, c, c, c, c, c, c, c);
            }
        }
        break;
    }

    case PL_DEINTERLACE_ALGORITHM_COUNT:
        pl_unreachable();
    }

    GLSL("}\n"); // End of primary/secondary field branch

done:
    GLSL("color.%s = res; \n"
         "#undef T \n"
         "#undef GET \n"
         "} \n",
         swiz);
}
diff --git a/src/shaders/dithering.c b/src/shaders/dithering.c new file mode 100644 index
0000000..4485d11 --- /dev/null +++ b/src/shaders/dithering.c @@ -0,0 +1,527 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <math.h> +#include "shaders.h" + +#include <libplacebo/shaders/dithering.h> + +const struct pl_dither_params pl_dither_default_params = { PL_DITHER_DEFAULTS }; + +struct sh_dither_obj { + pl_shader_obj lut; +}; + +static void sh_dither_uninit(pl_gpu gpu, void *ptr) +{ + struct sh_dither_obj *obj = ptr; + pl_shader_obj_destroy(&obj->lut); + *obj = (struct sh_dither_obj) {0}; +} + +static void fill_dither_matrix(void *data, const struct sh_lut_params *params) +{ + pl_assert(params->width > 0 && params->height > 0 && params->comps == 1); + + const struct pl_dither_params *dpar = params->priv; + switch (dpar->method) { + case PL_DITHER_ORDERED_LUT: + pl_assert(params->width == params->height); + pl_generate_bayer_matrix(data, params->width); + return; + + case PL_DITHER_BLUE_NOISE: + pl_assert(params->width == params->height); + pl_generate_blue_noise(data, params->width); + return; + + case PL_DITHER_ORDERED_FIXED: + case PL_DITHER_WHITE_NOISE: + case PL_DITHER_METHOD_COUNT: + return; + } + + pl_unreachable(); +} + +static bool dither_method_is_lut(enum pl_dither_method method) +{ + switch (method) { + case PL_DITHER_BLUE_NOISE: + case PL_DITHER_ORDERED_LUT: + return 
true; + case PL_DITHER_ORDERED_FIXED: + case PL_DITHER_WHITE_NOISE: + return false; + case PL_DITHER_METHOD_COUNT: + break; + } + + pl_unreachable(); +} + +static inline float approx_gamma(enum pl_color_transfer trc) +{ + switch (trc) { + case PL_COLOR_TRC_UNKNOWN: return 1.0f; + case PL_COLOR_TRC_LINEAR: return 1.0f; + case PL_COLOR_TRC_PRO_PHOTO:return 1.8f; + case PL_COLOR_TRC_GAMMA18: return 1.8f; + case PL_COLOR_TRC_GAMMA20: return 2.0f; + case PL_COLOR_TRC_GAMMA24: return 2.4f; + case PL_COLOR_TRC_GAMMA26: return 2.6f; + case PL_COLOR_TRC_ST428: return 2.6f; + case PL_COLOR_TRC_GAMMA28: return 2.8f; + + case PL_COLOR_TRC_SRGB: + case PL_COLOR_TRC_BT_1886: + case PL_COLOR_TRC_GAMMA22: + return 2.2f; + + case PL_COLOR_TRC_PQ: + case PL_COLOR_TRC_HLG: + case PL_COLOR_TRC_V_LOG: + case PL_COLOR_TRC_S_LOG1: + case PL_COLOR_TRC_S_LOG2: + return 2.0f; // TODO: handle this better + + case PL_COLOR_TRC_COUNT: break; + } + + pl_unreachable(); +} + +void pl_shader_dither(pl_shader sh, int new_depth, + pl_shader_obj *dither_state, + const struct pl_dither_params *params) +{ + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return; + + if (new_depth <= 0 || new_depth > 256) { + PL_WARN(sh, "Invalid dither depth: %d.. 
ignoring", new_depth); + return; + } + + sh_describef(sh, "dithering (%d bits)", new_depth); + GLSL("// pl_shader_dither \n" + "{ \n" + "float bias; \n"); + + params = PL_DEF(params, &pl_dither_default_params); + if (params->lut_size < 0 || params->lut_size > 8) { + SH_FAIL(sh, "Invalid `lut_size` specified: %d", params->lut_size); + return; + } + + enum pl_dither_method method = params->method; + ident_t lut = NULL_IDENT; + int lut_size = 0; + + if (dither_method_is_lut(method)) { + if (!dither_state) { + PL_WARN(sh, "LUT-based dither method specified but no dither state " + "object given, falling back to non-LUT based methods."); + goto fallback; + } + + struct sh_dither_obj *obj; + obj = SH_OBJ(sh, dither_state, PL_SHADER_OBJ_DITHER, + struct sh_dither_obj, sh_dither_uninit); + if (!obj) + goto fallback; + + bool cache = method == PL_DITHER_BLUE_NOISE; + lut_size = 1 << PL_DEF(params->lut_size, pl_dither_default_params.lut_size); + lut = sh_lut(sh, sh_lut_params( + .object = &obj->lut, + .var_type = PL_VAR_FLOAT, + .width = lut_size, + .height = lut_size, + .comps = 1, + .fill = fill_dither_matrix, + .signature = (CACHE_KEY_DITHER ^ method) * lut_size, + .cache = cache ? SH_CACHE(sh) : NULL, + .priv = (void *) params, + )); + if (!lut) + goto fallback; + } + + goto done; + +fallback: + method = PL_DITHER_ORDERED_FIXED; + // fall through + +done: ; + + int size = 0; + if (lut) { + size = lut_size; + } else if (method == PL_DITHER_ORDERED_FIXED) { + size = 16; // hard-coded size + } + + if (size) { + // Transform the screen position to the cyclic range [0,1) + GLSL("vec2 pos = fract(gl_FragCoord.xy * 1.0/"$"); \n", SH_FLOAT(size)); + + if (params->temporal) { + int phase = SH_PARAMS(sh).index % 8; + float r = phase * (M_PI / 2); // rotate + float m = phase < 4 ? 
1 : -1; // mirror + float mat[2][2] = { + {cos(r), -sin(r) }, + {sin(r) * m, cos(r) * m}, + }; + + ident_t rot = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_mat2("dither_rot"), + .data = &mat[0][0], + .dynamic = true, + }); + GLSL("pos = fract("$" * pos + vec2(1.0));\n", rot); + } + } + + switch (method) { + case PL_DITHER_WHITE_NOISE: { + ident_t prng = sh_prng(sh, params->temporal, NULL); + GLSL("bias = "$".x;\n", prng); + break; + } + + case PL_DITHER_ORDERED_FIXED: + // Bitwise ordered dither using only 32-bit uints + GLSL("uvec2 xy = uvec2(pos * 16.0) %% 16u; \n" + // Bitwise merge (morton number) + "xy.x = xy.x ^ xy.y; \n" + "xy = (xy | xy << 2) & uvec2(0x33333333); \n" + "xy = (xy | xy << 1) & uvec2(0x55555555); \n" + // Bitwise inversion + "uint b = xy.x + (xy.y << 1); \n" + "b = (b * 0x0802u & 0x22110u) | \n" + " (b * 0x8020u & 0x88440u); \n" + "b = 0x10101u * b; \n" + "b = (b >> 16) & 0xFFu; \n" + // Generate bias value + "bias = float(b) * 1.0/256.0; \n"); + break; + + case PL_DITHER_BLUE_NOISE: + case PL_DITHER_ORDERED_LUT: + pl_assert(lut); + GLSL("bias = "$"(ivec2(pos * "$"));\n", lut, SH_FLOAT(lut_size)); + break; + + case PL_DITHER_METHOD_COUNT: + pl_unreachable(); + } + + // Scale factor for dither rounding + GLSL("const float scale = %llu.0; \n", (1LLU << new_depth) - 1); + + const float gamma = approx_gamma(params->transfer); + if (gamma != 1.0f && new_depth <= 4) { + GLSL("const float gamma = "$"; \n" + "vec4 color_lin = pow(color, vec4(gamma)); \n", + SH_FLOAT(gamma)); + + if (new_depth == 1) { + // Special case for bit depth 1 dithering, in this case we can just + // ignore the low/high rounding because we know we are always + // dithering between 0.0 and 1.0. 
+ GLSL("const vec4 low = vec4(0.0); \n" + "const vec4 high = vec4(1.0); \n" + "vec4 offset = color_lin; \n"); + } else { + // Linearize the low, high and current color values + GLSL("vec4 low = floor(color * scale) / scale; \n" + "vec4 high = ceil(color * scale) / scale; \n" + "vec4 low_lin = pow(low, vec4(gamma)); \n" + "vec4 high_lin = pow(high, vec4(gamma)); \n" + "vec4 range = high_lin - low_lin; \n" + "vec4 offset = (color_lin - low_lin) / \n" + " max(range, 1e-6); \n"); + } + + // Mix in the correct ratio corresponding to the offset and bias + GLSL("color = mix(low, high, greaterThan(offset, vec4(bias))); \n"); + } else { + // Approximate each gamma segment as a straight line, this simplifies + // the process of dithering down to a single scale and (biased) round. + GLSL("color = scale * color + vec4(bias); \n" + "color = floor(color) * (1.0 / scale); \n"); + } + + GLSL("} \n"); +} + +/* Error diffusion code is taken from mpv, original copyright (c) 2019 Bin Jin + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see <http://www.gnu.org/licenses/>. + */ + +// After a (y, x) -> (y, x + y * shift) mapping, find the right most column that +// will be affected by the current column. 
+static int compute_rightmost_shifted_column(const struct pl_error_diffusion_kernel *k) +{ + int ret = 0; + for (int y = 0; y <= PL_EDF_MAX_DY; y++) { + for (int x = PL_EDF_MIN_DX; x <= PL_EDF_MAX_DX; x++) { + if (k->pattern[y][x - PL_EDF_MIN_DX] != 0) { + int shifted_x = x + y * k->shift; + + // The shift mapping guarantees current column (or left of it) + // won't be affected by error diffusion. + assert(shifted_x > 0); + + ret = PL_MAX(ret, shifted_x); + } + } + } + return ret; +} + +size_t pl_error_diffusion_shmem_req(const struct pl_error_diffusion_kernel *kernel, + int height) +{ + // We add PL_EDF_MAX_DY empty lines on the bottom to handle errors + // propagated out from bottom side. + int rows = height + PL_EDF_MAX_DY; + int shifted_columns = compute_rightmost_shifted_column(kernel) + 1; + + // The shared memory is an array of size rows*shifted_columns. Each element + // is a single uint for three RGB component. + return rows * shifted_columns * sizeof(uint32_t); +} + +bool pl_shader_error_diffusion(pl_shader sh, const struct pl_error_diffusion_params *params) +{ + const int width = params->input_tex->params.w, height = params->input_tex->params.h; + const struct pl_glsl_version glsl = sh_glsl(sh); + const struct pl_error_diffusion_kernel *kernel = + PL_DEF(params->kernel, &pl_error_diffusion_sierra_lite); + + pl_assert(params->output_tex->params.w == width); + pl_assert(params->output_tex->params.h == height); + if (!sh_require(sh, PL_SHADER_SIG_NONE, width, height)) + return false; + + if (params->new_depth <= 0 || params->new_depth > 256) { + PL_WARN(sh, "Invalid dither depth: %d.. ignoring", params->new_depth); + return false; + } + + // The parallel error diffusion works by applying the shift mapping first. + // Taking the Floyd and Steinberg algorithm for example. 
After applying + // the (y, x) -> (y, x + y * shift) mapping (with shift=2), all errors are + // propagated into the next few columns, which makes parallel processing on + // the same column possible. + // + // X 7/16 X 7/16 + // 3/16 5/16 1/16 ==> 0 0 3/16 5/16 1/16 + + // Figuring out the size of rectangle containing all shifted pixels. + // The rectangle height is not changed. + int shifted_width = width + (height - 1) * kernel->shift; + + // We process all pixels from the shifted rectangles column by column, with + // a single global work group of size |block_size|. + // Figuring out how many block are required to process all pixels. We need + // this explicitly to make the number of barrier() calls match. + int block_size = PL_MIN(glsl.max_group_threads, height); + int blocks = PL_DIV_UP(height * shifted_width, block_size); + + // If we figure out how many of the next columns will be affected while the + // current columns is being processed. We can store errors of only a few + // columns in the shared memory. Using a ring buffer will further save the + // cost while iterating to next column. + // + int ring_buffer_rows = height + PL_EDF_MAX_DY; + int ring_buffer_columns = compute_rightmost_shifted_column(kernel) + 1; + ident_t ring_buffer_size = sh_const(sh, (struct pl_shader_const) { + .type = PL_VAR_UINT, + .name = "ring_buffer_size", + .data = &(unsigned) { ring_buffer_rows * ring_buffer_columns }, + .compile_time = true, + }); + + // Compute shared memory requirements and try enabling compute shader. 
+ size_t shmem_req = ring_buffer_rows * ring_buffer_columns * sizeof(uint32_t); + if (!sh_try_compute(sh, block_size, 1, false, shmem_req)) { + PL_ERR(sh, "Cannot execute error diffusion kernel: too old GPU or " + "insufficient compute shader memory!"); + return false; + } + + ident_t in_tex = sh_desc(sh, (struct pl_shader_desc) { + .binding.object = params->input_tex, + .desc = { + .name = "input_tex", + .type = PL_DESC_SAMPLED_TEX, + }, + }); + + ident_t out_img = sh_desc(sh, (struct pl_shader_desc) { + .binding.object = params->output_tex, + .desc = { + .name = "output_tex", + .type = PL_DESC_STORAGE_IMG, + .access = PL_DESC_ACCESS_WRITEONLY, + }, + }); + + sh->output = PL_SHADER_SIG_NONE; + sh_describef(sh, "error diffusion (%s, %d bits)", + kernel->name, params->new_depth); + + // Defines the ring buffer in shared memory. + GLSLH("shared uint err_rgb8["$"]; \n", ring_buffer_size); + GLSL("// pl_shader_error_diffusion \n" + // Safeguard against accidental over-execution + "if (gl_WorkGroupID != uvec3(0)) \n" + " return; \n" + // Initialize the ring buffer. + "for (uint i = gl_LocalInvocationIndex; i < "$"; i+=gl_WorkGroupSize.x)\n" + " err_rgb8[i] = 0u; \n" + + // Main block loop, add barrier here to have previous block all + // processed before starting the processing of the next. + "for (uint block_id = 0; block_id < "$"; block_id++) { \n" + "barrier(); \n" + // Compute the coordinate of the pixel we are currently processing, + // both before and after the shift mapping. + "uint id = block_id * gl_WorkGroupSize.x + gl_LocalInvocationIndex; \n" + "const uint height = "$"; \n" + "int y = int(id %% height), x_shifted = int(id / height); \n" + "int x = x_shifted - y * %d; \n" + // Proceed only if we are processing a valid pixel. + "if (x >= 0 && x < "$") { \n" + // The index that the current pixel have on the ring buffer. + "uint idx = uint(x_shifted * "$" + y) %% "$"; \n" + // Fetch the current pixel. 
+ "vec4 pix_orig = texelFetch("$", ivec2(x, y), 0); \n" + "vec3 pix = pix_orig.rgb; \n", + ring_buffer_size, + SH_UINT(blocks), + SH_UINT(height), + kernel->shift, + SH_INT(width), + SH_INT(ring_buffer_rows), + ring_buffer_size, + in_tex); + + // The dithering will quantize pixel value into multiples of 1/dither_quant. + int dither_quant = (1 << params->new_depth) - 1; + + // We encode errors in RGB components into a single 32-bit unsigned integer. + // The error we propagate from the current pixel is in range of + // [-0.5 / dither_quant, 0.5 / dither_quant]. While not quite obvious, the + // sum of all errors been propagated into a pixel is also in the same range. + // It's possible to map errors in this range into [-127, 127], and use an + // unsigned 8-bit integer to store it (using standard two's complement). + // The three 8-bit unsigned integers can then be encoded into a single + // 32-bit unsigned integer, with two 4-bit padding to prevent addition + // operation overflows affecting other component. There are at most 12 + // addition operations on each pixel, so 4-bit padding should be enough. + // The overflow from R component will be discarded. + // + // The following figure is how the encoding looks like. + // + // +------------------------------------+ + // |RRRRRRRR|0000|GGGGGGGG|0000|BBBBBBBB| + // +------------------------------------+ + // + + // The bitshift position for R and G component. + const int bitshift_r = 24, bitshift_g = 12; + // The multiplier we use to map [-0.5, 0.5] to [-127, 127]. + const int uint8_mul = 127 * 2; + + GLSL(// Add the error previously propagated into current pixel, and clear + // it in the ring buffer. + "uint err_u32 = err_rgb8[idx] + %uu; \n" + "pix = pix * %d.0 + vec3(int((err_u32 >> %d) & 0xFFu) - 128, \n" + " int((err_u32 >> %d) & 0xFFu) - 128, \n" + " int( err_u32 & 0xFFu) - 128) / %d.0; \n" + "err_rgb8[idx] = 0u; \n" + // Write the dithered pixel. 
+ "vec3 dithered = round(pix); \n" + "imageStore("$", ivec2(x, y), vec4(dithered / %d.0, pix_orig.a)); \n" + // Prepare for error propagation pass + "vec3 err_divided = (pix - dithered) * %d.0 / %d.0; \n" + "ivec3 tmp; \n", + (128u << bitshift_r) | (128u << bitshift_g) | 128u, + dither_quant, bitshift_r, bitshift_g, uint8_mul, + out_img, dither_quant, + uint8_mul, kernel->divisor); + + // Group error propagation with same weight factor together, in order to + // reduce the number of annoying error encoding. + for (int dividend = 1; dividend <= kernel->divisor; dividend++) { + bool err_assigned = false; + + for (int y = 0; y <= PL_EDF_MAX_DY; y++) { + for (int x = PL_EDF_MIN_DX; x <= PL_EDF_MAX_DX; x++) { + if (kernel->pattern[y][x - PL_EDF_MIN_DX] != dividend) + continue; + + if (!err_assigned) { + err_assigned = true; + + GLSL("tmp = ivec3(round(err_divided * %d.0)); \n" + "err_u32 = (uint(tmp.r & 0xFF) << %d) | \n" + " (uint(tmp.g & 0xFF) << %d) | \n" + " uint(tmp.b & 0xFF); \n", + dividend, + bitshift_r, bitshift_g); + } + + int shifted_x = x + y * kernel->shift; + + // Unlike the right border, errors propagated out from left + // border will remain in the ring buffer. This will produce + // visible artifacts near the left border, especially for + // shift=3 kernels. + if (x < 0) + GLSL("if (x >= %d) \n", -x); + + // Calculate the new position in the ring buffer to propagate + // the error into. + int ring_buffer_delta = shifted_x * ring_buffer_rows + y; + GLSL("atomicAdd(err_rgb8[(idx + %du) %% "$"], err_u32); \n", + ring_buffer_delta, ring_buffer_size); + } + } + } + + GLSL("}} \n"); // end of main loop + valid pixel conditional + return true; +} diff --git a/src/shaders/film_grain.c b/src/shaders/film_grain.c new file mode 100644 index 0000000..b1d25ff --- /dev/null +++ b/src/shaders/film_grain.c @@ -0,0 +1,65 @@ +/* + * This file is part of libplacebo. 
+ * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "shaders.h" +#include "shaders/film_grain.h" + +bool pl_needs_film_grain(const struct pl_film_grain_params *params) +{ + switch (params->data.type) { + case PL_FILM_GRAIN_NONE: return false; + case PL_FILM_GRAIN_AV1: return pl_needs_fg_av1(params); + case PL_FILM_GRAIN_H274: return pl_needs_fg_h274(params); + default: pl_unreachable(); + } +} + +struct sh_grain_obj { + pl_shader_obj av1; + pl_shader_obj h274; +}; + +static void sh_grain_uninit(pl_gpu gpu, void *ptr) +{ + struct sh_grain_obj *obj = ptr; + pl_shader_obj_destroy(&obj->av1); + pl_shader_obj_destroy(&obj->h274); +} + +bool pl_shader_film_grain(pl_shader sh, pl_shader_obj *grain_state, + const struct pl_film_grain_params *params) +{ + if (!pl_needs_film_grain(params)) { + // FIXME: Instead of erroring, sample directly + SH_FAIL(sh, "pl_shader_film_grain called but no film grain needs to be " + "applied, test with `pl_needs_film_grain` first!"); + return false; + } + + struct sh_grain_obj *obj; + obj = SH_OBJ(sh, grain_state, PL_SHADER_OBJ_FILM_GRAIN, + struct sh_grain_obj, sh_grain_uninit); + if (!obj) + return false; + + switch (params->data.type) { + case PL_FILM_GRAIN_NONE: return false; + case PL_FILM_GRAIN_AV1: return pl_shader_fg_av1(sh, &obj->av1, params); + case PL_FILM_GRAIN_H274: return pl_shader_fg_h274(sh, 
&obj->h274, params); + default: pl_unreachable(); + } +} diff --git a/src/shaders/film_grain.h b/src/shaders/film_grain.h new file mode 100644 index 0000000..f6498c1 --- /dev/null +++ b/src/shaders/film_grain.h @@ -0,0 +1,75 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include "common.h" + +#include <libplacebo/shaders/film_grain.h> + +bool pl_needs_fg_av1(const struct pl_film_grain_params *); +bool pl_needs_fg_h274(const struct pl_film_grain_params *); + +bool pl_shader_fg_av1(pl_shader, pl_shader_obj *, const struct pl_film_grain_params *); +bool pl_shader_fg_h274(pl_shader, pl_shader_obj *, const struct pl_film_grain_params *); + +// Common helper function +static inline enum pl_channel channel_map(int i, const struct pl_film_grain_params *params) +{ + static const enum pl_channel map_rgb[3] = { + [PL_CHANNEL_G] = PL_CHANNEL_Y, + [PL_CHANNEL_B] = PL_CHANNEL_CB, + [PL_CHANNEL_R] = PL_CHANNEL_CR, + }; + + static const enum pl_channel map_xyz[3] = { + [1] = PL_CHANNEL_Y, // Y + [2] = PL_CHANNEL_CB, // Z + [0] = PL_CHANNEL_CR, // X + }; + + if (i >= params->components) + return PL_CHANNEL_NONE; + + int comp = params->component_mapping[i]; + if (comp < 0 || comp > 2) + return PL_CHANNEL_NONE; + + switch (params->repr->sys) { + case PL_COLOR_SYSTEM_UNKNOWN: + case 
PL_COLOR_SYSTEM_RGB: + return map_rgb[comp]; + case PL_COLOR_SYSTEM_XYZ: + return map_xyz[comp]; + + case PL_COLOR_SYSTEM_BT_601: + case PL_COLOR_SYSTEM_BT_709: + case PL_COLOR_SYSTEM_SMPTE_240M: + case PL_COLOR_SYSTEM_BT_2020_NC: + case PL_COLOR_SYSTEM_BT_2020_C: + case PL_COLOR_SYSTEM_BT_2100_PQ: + case PL_COLOR_SYSTEM_BT_2100_HLG: + case PL_COLOR_SYSTEM_DOLBYVISION: + case PL_COLOR_SYSTEM_YCGCO: + return comp; + + case PL_COLOR_SYSTEM_COUNT: + break; + } + + pl_unreachable(); +} diff --git a/src/shaders/film_grain_av1.c b/src/shaders/film_grain_av1.c new file mode 100644 index 0000000..3b11ea3 --- /dev/null +++ b/src/shaders/film_grain_av1.c @@ -0,0 +1,1001 @@ +/* + * This file is part of libplacebo, which is normally licensed under the terms + * of the LGPL v2.1+. However, this file (film_grain_av1.c) is also available + * under the terms of the more permissive MIT license: + * + * Copyright (c) 2018-2019 Niklas Haas + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "shaders.h" +#include "shaders/film_grain.h" + +// Taken from the spec. Range is [-2048, 2047], mean is 0 and stddev is 512 +static const int16_t gaussian_sequence[2048] = { + 56, 568, -180, 172, 124, -84, 172, -64, -900, 24, 820, + 224, 1248, 996, 272, -8, -916, -388, -732, -104, -188, 800, + 112, -652, -320, -376, 140, -252, 492, -168, 44, -788, 588, + -584, 500, -228, 12, 680, 272, -476, 972, -100, 652, 368, + 432, -196, -720, -192, 1000, -332, 652, -136, -552, -604, -4, + 192, -220, -136, 1000, -52, 372, -96, -624, 124, -24, 396, + 540, -12, -104, 640, 464, 244, -208, -84, 368, -528, -740, + 248, -968, -848, 608, 376, -60, -292, -40, -156, 252, -292, + 248, 224, -280, 400, -244, 244, -60, 76, -80, 212, 532, + 340, 128, -36, 824, -352, -60, -264, -96, -612, 416, -704, + 220, -204, 640, -160, 1220, -408, 900, 336, 20, -336, -96, + -792, 304, 48, -28, -1232, -1172, -448, 104, -292, -520, 244, + 60, -948, 0, -708, 268, 108, 356, -548, 488, -344, -136, + 488, -196, -224, 656, -236, -1128, 60, 4, 140, 276, -676, + -376, 168, -108, 464, 8, 564, 64, 240, 308, -300, -400, + -456, -136, 56, 120, -408, -116, 436, 504, -232, 328, 844, + -164, -84, 784, -168, 232, -224, 348, -376, 128, 568, 96, + -1244, -288, 276, 848, 832, -360, 656, 464, -384, -332, -356, + 728, -388, 160, -192, 468, 296, 224, 140, -776, -100, 280, + 4, 196, 44, -36, -648, 932, 16, 1428, 28, 528, 808, + 772, 20, 268, 88, -332, -284, 124, -384, -448, 208, -228, + -1044, -328, 660, 380, -148, -300, 588, 240, 540, 28, 136, + -88, -436, 256, 296, -1000, 1400, 0, -48, 1056, -136, 264, + -528, -1108, 632, -484, -592, -344, 796, 124, -668, -768, 388, + 1296, -232, -188, -200, -288, -4, 308, 100, -168, 256, -500, + 204, -508, 
648, -136, 372, -272, -120, -1004, -552, -548, -384, + 548, -296, 428, -108, -8, -912, -324, -224, -88, -112, -220, + -100, 996, -796, 548, 360, -216, 180, 428, -200, -212, 148, + 96, 148, 284, 216, -412, -320, 120, -300, -384, -604, -572, + -332, -8, -180, -176, 696, 116, -88, 628, 76, 44, -516, + 240, -208, -40, 100, -592, 344, -308, -452, -228, 20, 916, + -1752, -136, -340, -804, 140, 40, 512, 340, 248, 184, -492, + 896, -156, 932, -628, 328, -688, -448, -616, -752, -100, 560, + -1020, 180, -800, -64, 76, 576, 1068, 396, 660, 552, -108, + -28, 320, -628, 312, -92, -92, -472, 268, 16, 560, 516, + -672, -52, 492, -100, 260, 384, 284, 292, 304, -148, 88, + -152, 1012, 1064, -228, 164, -376, -684, 592, -392, 156, 196, + -524, -64, -884, 160, -176, 636, 648, 404, -396, -436, 864, + 424, -728, 988, -604, 904, -592, 296, -224, 536, -176, -920, + 436, -48, 1176, -884, 416, -776, -824, -884, 524, -548, -564, + -68, -164, -96, 692, 364, -692, -1012, -68, 260, -480, 876, + -1116, 452, -332, -352, 892, -1088, 1220, -676, 12, -292, 244, + 496, 372, -32, 280, 200, 112, -440, -96, 24, -644, -184, + 56, -432, 224, -980, 272, -260, 144, -436, 420, 356, 364, + -528, 76, 172, -744, -368, 404, -752, -416, 684, -688, 72, + 540, 416, 92, 444, 480, -72, -1416, 164, -1172, -68, 24, + 424, 264, 1040, 128, -912, -524, -356, 64, 876, -12, 4, + -88, 532, 272, -524, 320, 276, -508, 940, 24, -400, -120, + 756, 60, 236, -412, 100, 376, -484, 400, -100, -740, -108, + -260, 328, -268, 224, -200, -416, 184, -604, -564, -20, 296, + 60, 892, -888, 60, 164, 68, -760, 216, -296, 904, -336, + -28, 404, -356, -568, -208, -1480, -512, 296, 328, -360, -164, + -1560, -776, 1156, -428, 164, -504, -112, 120, -216, -148, -264, + 308, 32, 64, -72, 72, 116, 176, -64, -272, 460, -536, + -784, -280, 348, 108, -752, -132, 524, -540, -776, 116, -296, + -1196, -288, -560, 1040, -472, 116, -848, -1116, 116, 636, 696, + 284, -176, 1016, 204, -864, -648, -248, 356, 972, -584, -204, + 264, 880, 528, -24, -184, 116, 
448, -144, 828, 524, 212, + -212, 52, 12, 200, 268, -488, -404, -880, 824, -672, -40, + 908, -248, 500, 716, -576, 492, -576, 16, 720, -108, 384, + 124, 344, 280, 576, -500, 252, 104, -308, 196, -188, -8, + 1268, 296, 1032, -1196, 436, 316, 372, -432, -200, -660, 704, + -224, 596, -132, 268, 32, -452, 884, 104, -1008, 424, -1348, + -280, 4, -1168, 368, 476, 696, 300, -8, 24, 180, -592, + -196, 388, 304, 500, 724, -160, 244, -84, 272, -256, -420, + 320, 208, -144, -156, 156, 364, 452, 28, 540, 316, 220, + -644, -248, 464, 72, 360, 32, -388, 496, -680, -48, 208, + -116, -408, 60, -604, -392, 548, -840, 784, -460, 656, -544, + -388, -264, 908, -800, -628, -612, -568, 572, -220, 164, 288, + -16, -308, 308, -112, -636, -760, 280, -668, 432, 364, 240, + -196, 604, 340, 384, 196, 592, -44, -500, 432, -580, -132, + 636, -76, 392, 4, -412, 540, 508, 328, -356, -36, 16, + -220, -64, -248, -60, 24, -192, 368, 1040, 92, -24, -1044, + -32, 40, 104, 148, 192, -136, -520, 56, -816, -224, 732, + 392, 356, 212, -80, -424, -1008, -324, 588, -1496, 576, 460, + -816, -848, 56, -580, -92, -1372, -112, -496, 200, 364, 52, + -140, 48, -48, -60, 84, 72, 40, 132, -356, -268, -104, + -284, -404, 732, -520, 164, -304, -540, 120, 328, -76, -460, + 756, 388, 588, 236, -436, -72, -176, -404, -316, -148, 716, + -604, 404, -72, -88, -888, -68, 944, 88, -220, -344, 960, + 472, 460, -232, 704, 120, 832, -228, 692, -508, 132, -476, + 844, -748, -364, -44, 1116, -1104, -1056, 76, 428, 552, -692, + 60, 356, 96, -384, -188, -612, -576, 736, 508, 892, 352, + -1132, 504, -24, -352, 324, 332, -600, -312, 292, 508, -144, + -8, 484, 48, 284, -260, -240, 256, -100, -292, -204, -44, + 472, -204, 908, -188, -1000, -256, 92, 1164, -392, 564, 356, + 652, -28, -884, 256, 484, -192, 760, -176, 376, -524, -452, + -436, 860, -736, 212, 124, 504, -476, 468, 76, -472, 552, + -692, -944, -620, 740, -240, 400, 132, 20, 192, -196, 264, + -668, -1012, -60, 296, -316, -828, 76, -156, 284, -768, -448, + -832, 148, 248, 652, 
616, 1236, 288, -328, -400, -124, 588, + 220, 520, -696, 1032, 768, -740, -92, -272, 296, 448, -464, + 412, -200, 392, 440, -200, 264, -152, -260, 320, 1032, 216, + 320, -8, -64, 156, -1016, 1084, 1172, 536, 484, -432, 132, + 372, -52, -256, 84, 116, -352, 48, 116, 304, -384, 412, + 924, -300, 528, 628, 180, 648, 44, -980, -220, 1320, 48, + 332, 748, 524, -268, -720, 540, -276, 564, -344, -208, -196, + 436, 896, 88, -392, 132, 80, -964, -288, 568, 56, -48, + -456, 888, 8, 552, -156, -292, 948, 288, 128, -716, -292, + 1192, -152, 876, 352, -600, -260, -812, -468, -28, -120, -32, + -44, 1284, 496, 192, 464, 312, -76, -516, -380, -456, -1012, + -48, 308, -156, 36, 492, -156, -808, 188, 1652, 68, -120, + -116, 316, 160, -140, 352, 808, -416, 592, 316, -480, 56, + 528, -204, -568, 372, -232, 752, -344, 744, -4, 324, -416, + -600, 768, 268, -248, -88, -132, -420, -432, 80, -288, 404, + -316, -1216, -588, 520, -108, 92, -320, 368, -480, -216, -92, + 1688, -300, 180, 1020, -176, 820, -68, -228, -260, 436, -904, + 20, 40, -508, 440, -736, 312, 332, 204, 760, -372, 728, + 96, -20, -632, -520, -560, 336, 1076, -64, -532, 776, 584, + 192, 396, -728, -520, 276, -188, 80, -52, -612, -252, -48, + 648, 212, -688, 228, -52, -260, 428, -412, -272, -404, 180, + 816, -796, 48, 152, 484, -88, -216, 988, 696, 188, -528, + 648, -116, -180, 316, 476, 12, -564, 96, 476, -252, -364, + -376, -392, 556, -256, -576, 260, -352, 120, -16, -136, -260, + -492, 72, 556, 660, 580, 616, 772, 436, 424, -32, -324, + -1268, 416, -324, -80, 920, 160, 228, 724, 32, -516, 64, + 384, 68, -128, 136, 240, 248, -204, -68, 252, -932, -120, + -480, -628, -84, 192, 852, -404, -288, -132, 204, 100, 168, + -68, -196, -868, 460, 1080, 380, -80, 244, 0, 484, -888, + 64, 184, 352, 600, 460, 164, 604, -196, 320, -64, 588, + -184, 228, 12, 372, 48, -848, -344, 224, 208, -200, 484, + 128, -20, 272, -468, -840, 384, 256, -720, -520, -464, -580, + 112, -120, 644, -356, -208, -608, -528, 704, 560, -424, 392, + 828, 40, 84, 
200, -152, 0, -144, 584, 280, -120, 80, + -556, -972, -196, -472, 724, 80, 168, -32, 88, 160, -688, + 0, 160, 356, 372, -776, 740, -128, 676, -248, -480, 4, + -364, 96, 544, 232, -1032, 956, 236, 356, 20, -40, 300, + 24, -676, -596, 132, 1120, -104, 532, -1096, 568, 648, 444, + 508, 380, 188, -376, -604, 1488, 424, 24, 756, -220, -192, + 716, 120, 920, 688, 168, 44, -460, 568, 284, 1144, 1160, + 600, 424, 888, 656, -356, -320, 220, 316, -176, -724, -188, + -816, -628, -348, -228, -380, 1012, -452, -660, 736, 928, 404, + -696, -72, -268, -892, 128, 184, -344, -780, 360, 336, 400, + 344, 428, 548, -112, 136, -228, -216, -820, -516, 340, 92, + -136, 116, -300, 376, -244, 100, -316, -520, -284, -12, 824, + 164, -548, -180, -128, 116, -924, -828, 268, -368, -580, 620, + 192, 160, 0, -1676, 1068, 424, -56, -360, 468, -156, 720, + 288, -528, 556, -364, 548, -148, 504, 316, 152, -648, -620, + -684, -24, -376, -384, -108, -920, -1032, 768, 180, -264, -508, + -1268, -260, -60, 300, -240, 988, 724, -376, -576, -212, -736, + 556, 192, 1092, -620, -880, 376, -56, -4, -216, -32, 836, + 268, 396, 1332, 864, -600, 100, 56, -412, -92, 356, 180, + 884, -468, -436, 292, -388, -804, -704, -840, 368, -348, 140, + -724, 1536, 940, 372, 112, -372, 436, -480, 1136, 296, -32, + -228, 132, -48, -220, 868, -1016, -60, -1044, -464, 328, 916, + 244, 12, -736, -296, 360, 468, -376, -108, -92, 788, 368, + -56, 544, 400, -672, -420, 728, 16, 320, 44, -284, -380, + -796, 488, 132, 204, -596, -372, 88, -152, -908, -636, -572, + -624, -116, -692, -200, -56, 276, -88, 484, -324, 948, 864, + 1000, -456, -184, -276, 292, -296, 156, 676, 320, 160, 908, + -84, -1236, -288, -116, 260, -372, -644, 732, -756, -96, 84, + 344, -520, 348, -688, 240, -84, 216, -1044, -136, -676, -396, + -1500, 960, -40, 176, 168, 1516, 420, -504, -344, -364, -360, + 1216, -940, -380, -212, 252, -660, -708, 484, -444, -152, 928, + -120, 1112, 476, -260, 560, -148, -344, 108, -196, 228, -288, + 504, 560, -328, -88, 288, -1008, 
/*
 * Advance the 16-bit linear-feedback shift register in `*state` by one step
 * and return the top `bits` bits of the updated state.
 *
 * The feedback bit is the XOR of state bits 0, 1, 3 and 12; the register is
 * shifted right by one and the feedback bit is inserted at bit 15.
 *
 * `bits` must be in the range [1, 16].
 */
static inline int get_random_number(int bits, uint16_t *state)
{
    // Work in unsigned arithmetic so that no shift ever touches a sign bit
    unsigned r = *state;
    unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1u;
    *state = (uint16_t) ((r >> 1) | (bit << 15));

    return (*state >> (16 - bits)) & ((1 << bits) - 1);
}

/*
 * Divide `x` by 2^shift, rounding to nearest by adding half of the divisor
 * before shifting. A `shift` of 0 returns `x` unchanged.
 */
static inline int round2(int x, int shift)
{
    if (!shift)
        return x;

    return (x + (1 << (shift - 1))) >> shift;
}
+static void generate_grain_y(float out[GRAIN_HEIGHT_LUT][GRAIN_WIDTH_LUT], + int16_t buf[GRAIN_HEIGHT][GRAIN_WIDTH], + const struct pl_film_grain_params *params) +{ + const struct pl_av1_grain_data *data = ¶ms->data.params.av1; + struct grain_scale scale = get_grain_scale(params); + uint16_t seed = (uint16_t) params->data.seed; + int bits = bit_depth(params->repr); + int shift = 12 - bits + data->grain_scale_shift; + pl_assert(shift >= 0); + + for (int y = 0; y < GRAIN_HEIGHT; y++) { + for (int x = 0; x < GRAIN_WIDTH; x++) { + int16_t value = gaussian_sequence[ get_random_number(11, &seed) ]; + buf[y][x] = round2(value, shift); + } + } + + const int ar_pad = 3; + int ar_lag = data->ar_coeff_lag; + + for (int y = ar_pad; y < GRAIN_HEIGHT; y++) { + for (int x = ar_pad; x < GRAIN_WIDTH - ar_pad; x++) { + const int8_t *coeff = data->ar_coeffs_y; + int sum = 0; + for (int dy = -ar_lag; dy <= 0; dy++) { + for (int dx = -ar_lag; dx <= ar_lag; dx++) { + if (!dx && !dy) + break; + sum += *(coeff++) * buf[y + dy][x + dx]; + } + } + + int16_t grain = buf[y][x] + round2(sum, data->ar_coeff_shift); + grain = PL_CLAMP(grain, scale.grain_min, scale.grain_max); + buf[y][x] = grain; + } + } + + for (int y = 0; y < GRAIN_HEIGHT_LUT; y++) { + for (int x = 0; x < GRAIN_WIDTH_LUT; x++) { + int16_t grain = buf[y + GRAIN_PAD_LUT][x + GRAIN_PAD_LUT]; + out[y][x] = grain * scale.grain_scale; + } + } +} + +static void generate_grain_uv(float *out, int16_t buf[GRAIN_HEIGHT][GRAIN_WIDTH], + const int16_t buf_y[GRAIN_HEIGHT][GRAIN_WIDTH], + enum pl_channel channel, int sub_x, int sub_y, + const struct pl_film_grain_params *params) +{ + const struct pl_av1_grain_data *data = ¶ms->data.params.av1; + struct grain_scale scale = get_grain_scale(params); + int bits = bit_depth(params->repr); + int shift = 12 - bits + data->grain_scale_shift; + pl_assert(shift >= 0); + + uint16_t seed = params->data.seed; + if (channel == PL_CHANNEL_CB) { + seed ^= 0xb524; + } else if (channel == PL_CHANNEL_CR) { + 
seed ^= 0x49d8; + } + + int chromaW = sub_x ? SUB_GRAIN_WIDTH : GRAIN_WIDTH; + int chromaH = sub_y ? SUB_GRAIN_HEIGHT : GRAIN_HEIGHT; + + const int8_t *coeffs[] = { + [PL_CHANNEL_CB] = data->ar_coeffs_uv[0], + [PL_CHANNEL_CR] = data->ar_coeffs_uv[1], + }; + + for (int y = 0; y < chromaH; y++) { + for (int x = 0; x < chromaW; x++) { + int16_t value = gaussian_sequence[ get_random_number(11, &seed) ]; + buf[y][x] = round2(value, shift); + } + } + + const int ar_pad = 3; + int ar_lag = data->ar_coeff_lag; + + for (int y = ar_pad; y < chromaH; y++) { + for (int x = ar_pad; x < chromaW - ar_pad; x++) { + const int8_t *coeff = coeffs[channel]; + pl_assert(coeff); + int sum = 0; + for (int dy = -ar_lag; dy <= 0; dy++) { + for (int dx = -ar_lag; dx <= ar_lag; dx++) { + // For the final (current) pixel, we need to add in the + // contribution from the luma grain texture + if (!dx && !dy) { + if (!data->num_points_y) + break; + int luma = 0; + int lumaX = ((x - ar_pad) << sub_x) + ar_pad; + int lumaY = ((y - ar_pad) << sub_y) + ar_pad; + for (int i = 0; i <= sub_y; i++) { + for (int j = 0; j <= sub_x; j++) { + luma += buf_y[lumaY + i][lumaX + j]; + } + } + luma = round2(luma, sub_x + sub_y); + sum += luma * (*coeff); + break; + } + + sum += *(coeff++) * buf[y + dy][x + dx]; + } + } + + int16_t grain = buf[y][x] + round2(sum, data->ar_coeff_shift); + grain = PL_CLAMP(grain, scale.grain_min, scale.grain_max); + buf[y][x] = grain; + } + } + + int lutW = GRAIN_WIDTH_LUT >> sub_x; + int lutH = GRAIN_HEIGHT_LUT >> sub_y; + int padX = sub_x ? SUB_GRAIN_PAD_LUT : GRAIN_PAD_LUT; + int padY = sub_y ? 
SUB_GRAIN_PAD_LUT : GRAIN_PAD_LUT; + + for (int y = 0; y < lutH; y++) { + for (int x = 0; x < lutW; x++) { + int16_t grain = buf[y + padY][x + padX]; + out[y * lutW + x] = grain * scale.grain_scale; + } + } +} + +static void generate_offsets(void *pbuf, const struct sh_lut_params *params) +{ + const struct pl_film_grain_data *data = params->priv; + unsigned int *buf = pbuf; + pl_static_assert(sizeof(unsigned int) >= sizeof(uint32_t)); + + for (int y = 0; y < params->height; y++) { + uint16_t state = data->seed; + state ^= ((y * 37 + 178) & 0xFF) << 8; + state ^= ((y * 173 + 105) & 0xFF); + + for (int x = 0; x < params->width; x++) { + unsigned int *offsets = &buf[y * params->width + x]; + + uint8_t val = get_random_number(8, &state); + uint8_t val_l = x ? (offsets - 1)[0] : 0; + uint8_t val_t = y ? (offsets - params->width)[0] : 0; + uint8_t val_tl = x && y ? (offsets - params->width - 1)[0] : 0; + + // Encode four offsets into a single 32-bit integer for the + // convenience of the GPU. That way only one LUT fetch is + // required for the entire block. 
+ *offsets = ((uint32_t) val_tl << OFFSET_TL) + | ((uint32_t) val_t << OFFSET_T) + | ((uint32_t) val_l << OFFSET_L) + | ((uint32_t) val << OFFSET_N); + } + } +} + +static void generate_scaling(void *pdata, const struct sh_lut_params *params) +{ + assert(params->width == SCALING_LUT_SIZE && params->comps == 1); + float *data = pdata; + + struct { + int num; + uint8_t (*points)[2]; + const struct pl_av1_grain_data *data; + } *ctx = params->priv; + + float range = 1 << ctx->data->scaling_shift; + + // Fill up the preceding entries with the initial value + for (int i = 0; i < ctx->points[0][0]; i++) + data[i] = ctx->points[0][1] / range; + + // Linearly interpolate the values in the middle + for (int i = 0; i < ctx->num - 1; i++) { + int bx = ctx->points[i][0]; + int by = ctx->points[i][1]; + int dx = ctx->points[i + 1][0] - bx; + int dy = ctx->points[i + 1][1] - by; + int delta = dy * ((0x10000 + (dx >> 1)) / dx); + for (int x = 0; x < dx; x++) { + int v = by + ((x * delta + 0x8000) >> 16); + data[bx + x] = v / range; + } + } + + // Fill up the remaining entries with the final value + for (int i = ctx->points[ctx->num - 1][0]; i < SCALING_LUT_SIZE; i++) + data[i] = ctx->points[ctx->num - 1][1] / range; +} + +static void sample(pl_shader sh, enum offset off, ident_t lut, int idx, + int sub_x, int sub_y) +{ + int dx = (off & OFFSET_L) ? 1 : 0, + dy = (off & OFFSET_T) ? 1 : 0; + + static const char *index_strs[] = { + [0] = ".x", + [1] = ".y", + }; + + GLSL("offset = uvec2(%du, %du) * uvec2((data >> %d) & 0xFu, \n" + " (data >> %d) & 0xFu);\n" + "pos = offset + local_id.xy + uvec2(%d, %d); \n" + "val = "$"(pos)%s; \n", + sub_x ? 1 : 2, sub_y ? 1 : 2, off + 4, off, + (BLOCK_SIZE >> sub_x) * dx, + (BLOCK_SIZE >> sub_y) * dy, + lut, idx >= 0 ? 
index_strs[idx] : ""); +} + +struct grain_obj_av1 { + // LUT objects for the offsets, grain and scaling luts + pl_shader_obj lut_offsets; + pl_shader_obj lut_grain[2]; + pl_shader_obj lut_scaling[3]; + + // Previous parameters used to check reusability + struct pl_film_grain_data data; + struct pl_color_repr repr; + bool fg_has_y; + bool fg_has_u; + bool fg_has_v; + + // Space to store the temporary arrays, reused + uint32_t *offsets; + float grain[2][GRAIN_HEIGHT_LUT][GRAIN_WIDTH_LUT]; + int16_t grain_tmp_y[GRAIN_HEIGHT][GRAIN_WIDTH]; + int16_t grain_tmp_uv[GRAIN_HEIGHT][GRAIN_WIDTH]; +}; + +static void av1_grain_uninit(pl_gpu gpu, void *ptr) +{ + struct grain_obj_av1 *obj = ptr; + pl_shader_obj_destroy(&obj->lut_offsets); + for (int i = 0; i < PL_ARRAY_SIZE(obj->lut_grain); i++) + pl_shader_obj_destroy(&obj->lut_grain[i]); + for (int i = 0; i < PL_ARRAY_SIZE(obj->lut_scaling); i++) + pl_shader_obj_destroy(&obj->lut_scaling[i]); + *obj = (struct grain_obj_av1) {0}; +} + +bool pl_needs_fg_av1(const struct pl_film_grain_params *params) +{ + const struct pl_av1_grain_data *data = ¶ms->data.params.av1; + bool has_y = data->num_points_y > 0; + bool has_u = data->num_points_uv[0] > 0 || data->chroma_scaling_from_luma; + bool has_v = data->num_points_uv[1] > 0 || data->chroma_scaling_from_luma; + + for (int i = 0; i < 3; i++) { + enum pl_channel channel = channel_map(i, params); + if (channel == PL_CHANNEL_Y && has_y) + return true; + if (channel == PL_CHANNEL_CB && has_u) + return true; + if (channel == PL_CHANNEL_CR && has_v) + return true; + } + + return false; +} + +static inline bool av1_grain_data_eq(const struct pl_film_grain_data *da, + const struct pl_film_grain_data *db) +{ + const struct pl_av1_grain_data *a = &da->params.av1, *b = &db->params.av1; + + // Only check the fields that are relevant for grain LUT generation + return da->seed == db->seed && + a->chroma_scaling_from_luma == b->chroma_scaling_from_luma && + a->scaling_shift == b->scaling_shift && + 
a->ar_coeff_lag == b->ar_coeff_lag && + a->ar_coeff_shift == b->ar_coeff_shift && + a->grain_scale_shift == b->grain_scale_shift && + !memcmp(a->ar_coeffs_y, b->ar_coeffs_y, sizeof(a->ar_coeffs_y)) && + !memcmp(a->ar_coeffs_uv, b->ar_coeffs_uv, sizeof(a->ar_coeffs_uv)); +} + +static void fill_grain_lut(void *data, const struct sh_lut_params *params) +{ + struct grain_obj_av1 *obj = params->priv; + size_t entries = params->width * params->height * params->comps; + memcpy(data, obj->grain, entries * sizeof(float)); +} + +bool pl_shader_fg_av1(pl_shader sh, pl_shader_obj *grain_state, + const struct pl_film_grain_params *params) +{ + int sub_x = 0, sub_y = 0; + int tex_w = params->tex->params.w, + tex_h = params->tex->params.h; + + if (params->luma_tex) { + sub_x = params->luma_tex->params.w > tex_w; + sub_y = params->luma_tex->params.h > tex_h; + } + + const struct pl_av1_grain_data *data = ¶ms->data.params.av1; + bool fg_has_y = data->num_points_y > 0; + bool fg_has_u = data->num_points_uv[0] > 0 || data->chroma_scaling_from_luma; + bool fg_has_v = data->num_points_uv[1] > 0 || data->chroma_scaling_from_luma; + + bool tex_is_y = false, tex_is_cb = false, tex_is_cr = false; + for (int i = 0; i < 3; i++) { + switch (channel_map(i, params)) { + case PL_CHANNEL_Y: tex_is_y = true; break; + case PL_CHANNEL_CB: tex_is_cb = true; break; + case PL_CHANNEL_CR: tex_is_cr = true; break; + default: break; + }; + } + + if (tex_is_y && (sub_x || sub_y)) { + PL_WARN(sh, "pl_film_grain_params.channels includes PL_CHANNEL_Y but " + "plane is subsampled, this makes no sense. 
Continuing anyway " + "but output is likely incorrect."); + } + + if (!sh_require(sh, PL_SHADER_SIG_NONE, tex_w, tex_h)) + return false; + + pl_gpu gpu = SH_GPU(sh); + if (!gpu) { + PL_ERR(sh, "AV1 film grain synthesis requires a non-NULL pl_gpu!"); + return false; + } + + // Disable generation for unneeded component types + fg_has_y &= tex_is_y; + fg_has_u &= tex_is_cb; + fg_has_v &= tex_is_cr; + + int bw = BLOCK_SIZE >> sub_x; + int bh = BLOCK_SIZE >> sub_y; + bool is_compute = sh_try_compute(sh, bw, bh, false, sizeof(uint32_t)); + + struct grain_obj_av1 *obj; + obj = SH_OBJ(sh, grain_state, PL_SHADER_OBJ_AV1_GRAIN, + struct grain_obj_av1, av1_grain_uninit); + if (!obj) + return false; + + // Note: In theory we could check only the parameters related to luma or + // only related to chroma and skip updating for changes to irrelevant + // parts, but this is probably not worth it since the seed is expected to + // change per frame anyway. + bool needs_update = !av1_grain_data_eq(¶ms->data, &obj->data) || + !pl_color_repr_equal(params->repr, &obj->repr) || + fg_has_y != obj->fg_has_y || + fg_has_u != obj->fg_has_u || + fg_has_v != obj->fg_has_v; + + if (needs_update) { + // This is needed even for chroma, so statically generate it + generate_grain_y(obj->grain[0], obj->grain_tmp_y, params); + } + + ident_t lut[3]; + int idx[3] = {-1}; + + if (fg_has_y) { + lut[0] = sh_lut(sh, sh_lut_params( + .object = &obj->lut_grain[0], + .var_type = PL_VAR_FLOAT, + .lut_type = SH_LUT_TEXTURE, + .width = GRAIN_WIDTH_LUT, + .height = GRAIN_HEIGHT_LUT, + .comps = 1, + .update = needs_update, + .dynamic = true, + .fill = fill_grain_lut, + .priv = obj, + )); + + if (!lut[0]) { + SH_FAIL(sh, "Failed generating/uploading luma grain LUT!"); + return false; + } + } + + // Try merging the chroma LUTs into a single texture + int chroma_comps = 0; + if (fg_has_u) { + generate_grain_uv(&obj->grain[chroma_comps][0][0], obj->grain_tmp_uv, + obj->grain_tmp_y, PL_CHANNEL_CB, sub_x, sub_y, + 
params); + idx[1] = chroma_comps++; + } + if (fg_has_v) { + generate_grain_uv(&obj->grain[chroma_comps][0][0], obj->grain_tmp_uv, + obj->grain_tmp_y, PL_CHANNEL_CR, sub_x, sub_y, + params); + idx[2] = chroma_comps++; + } + + if (chroma_comps > 0) { + lut[1] = lut[2] = sh_lut(sh, sh_lut_params( + .object = &obj->lut_grain[1], + .var_type = PL_VAR_FLOAT, + .lut_type = SH_LUT_TEXTURE, + .width = GRAIN_WIDTH_LUT >> sub_x, + .height = GRAIN_HEIGHT_LUT >> sub_y, + .comps = chroma_comps, + .update = needs_update, + .dynamic = true, + .fill = fill_grain_lut, + .priv = obj, + )); + + if (!lut[1]) { + SH_FAIL(sh, "Failed generating/uploading chroma grain LUT!"); + return false; + } + + if (chroma_comps == 1) + idx[1] = idx[2] = -1; + } + + ident_t offsets = sh_lut(sh, sh_lut_params( + .object = &obj->lut_offsets, + .var_type = PL_VAR_UINT, + .lut_type = SH_LUT_AUTO, + .width = PL_ALIGN2(tex_w << sub_x, 128) / 32, + .height = PL_ALIGN2(tex_h << sub_y, 128) / 32, + .comps = 1, + .update = needs_update, + .dynamic = true, + .fill = generate_offsets, + .priv = (void *) ¶ms->data, + )); + + if (!offsets) { + SH_FAIL(sh, "Failed generating/uploading block offsets LUT!"); + return false; + } + + // For the scaling LUTs, we assume they'll be relatively constant + // throughout the video so doing some extra work to avoid reinitializing + // them constantly is probably worth it. Probably. 
+ const struct pl_av1_grain_data *obj_data = &obj->data.params.av1; + bool scaling_changed = false; + if (fg_has_y || data->chroma_scaling_from_luma) { + scaling_changed |= data->num_points_y != obj_data->num_points_y; + scaling_changed |= memcmp(data->points_y, obj_data->points_y, + sizeof(data->points_y)); + } + + if (fg_has_u && !data->chroma_scaling_from_luma) { + scaling_changed |= data->num_points_uv[0] != obj_data->num_points_uv[0]; + scaling_changed |= memcmp(data->points_uv[0], + obj_data->points_uv[0], + sizeof(data->points_uv[0])); + } + + if (fg_has_v && !data->chroma_scaling_from_luma) { + scaling_changed |= data->num_points_uv[1] != obj_data->num_points_uv[1]; + scaling_changed |= memcmp(data->points_uv[1], + obj_data->points_uv[1], + sizeof(data->points_uv[1])); + } + + ident_t scaling[3] = {0}; + for (int i = 0; i < 3; i++) { + struct { + int num; + const uint8_t (*points)[2]; + const struct pl_av1_grain_data *data; + } priv; + + priv.data = data; + if (i == 0 || data->chroma_scaling_from_luma) { + priv.num = data->num_points_y; + priv.points = &data->points_y[0]; + } else { + priv.num = data->num_points_uv[i - 1]; + priv.points = &data->points_uv[i - 1][0]; + } + + // Skip scaling for unneeded channels + bool has_c[3] = { fg_has_y, fg_has_u, fg_has_v }; + if (has_c[i] && priv.num > 0) { + scaling[i] = sh_lut(sh, sh_lut_params( + .object = &obj->lut_scaling[i], + .var_type = PL_VAR_FLOAT, + .method = SH_LUT_LINEAR, + .width = SCALING_LUT_SIZE, + .comps = 1, + .update = scaling_changed, + .dynamic = true, + .fill = generate_scaling, + .priv = &priv, + )); + + if (!scaling[i]) { + SH_FAIL(sh, "Failed generating/uploading scaling LUTs!"); + return false; + } + } + } + + // Done updating LUTs + obj->data = params->data; + obj->repr = *params->repr; + obj->fg_has_y = fg_has_y; + obj->fg_has_u = fg_has_u; + obj->fg_has_v = fg_has_v; + + sh_describe(sh, "AV1 film grain"); + GLSL("vec4 color; \n" + "// pl_shader_film_grain (AV1) \n" + "{ \n" + "uvec2 
offset; \n" + "uvec2 pos; \n" + "float val; \n" + "float grain; \n"); + + if (is_compute) { + GLSL("uvec2 block_id = gl_WorkGroupID.xy; \n" + "uvec2 local_id = gl_LocalInvocationID.xy; \n" + "uvec2 global_id = gl_GlobalInvocationID.xy; \n"); + } else { + GLSL("uvec2 global_id = uvec2(gl_FragCoord); \n" + "uvec2 block_id = global_id / uvec2(%d, %d); \n" + "uvec2 local_id = global_id - uvec2(%d, %d) * block_id; \n", + bw, bh, bw, bh); + } + + // Load the data vector which holds the offsets + if (is_compute) { + ident_t id = sh_fresh(sh, "data"); + GLSLH("shared uint "$"; \n", id); + GLSL("if (gl_LocalInvocationIndex == 0u) \n" + " "$" = uint("$"(block_id)); \n" + "barrier(); \n" + "uint data = "$"; \n", + id, offsets, id); + } else { + GLSL("uint data = uint("$"(block_id)); \n", offsets); + } + + struct grain_scale scale = get_grain_scale(params); + pl_color_repr_normalize(params->repr); + int bits = PL_DEF(params->repr->bits.color_depth, 8); + pl_assert(bits >= 8); + + ident_t minValue, maxLuma, maxChroma; + if (pl_color_levels_guess(params->repr) == PL_COLOR_LEVELS_LIMITED) { + float out_scale = (1 << bits) / ((1 << bits) - 1.0); + minValue = SH_FLOAT(16 / 256.0 * out_scale); + maxLuma = SH_FLOAT(235 / 256.0 * out_scale); + maxChroma = SH_FLOAT(240 / 256.0 * out_scale); + if (!pl_color_system_is_ycbcr_like(params->repr->sys)) + maxChroma = maxLuma; + } else { + minValue = SH_FLOAT(0.0); + maxLuma = SH_FLOAT(1.0); + maxChroma = SH_FLOAT(1.0); + } + + // Load the color value of the tex itself + ident_t tex = sh_desc(sh, (struct pl_shader_desc) { + .binding.object = params->tex, + .desc = (struct pl_desc) { + .name = "tex", + .type = PL_DESC_SAMPLED_TEX, + }, + }); + + ident_t tex_scale = SH_FLOAT(scale.texture_scale); + GLSL("color = vec4("$") * texelFetch("$", ivec2(global_id), 0); \n", + tex_scale, tex); + + // If we need access to the external luma plane, load it now + if (tex_is_cb || tex_is_cr) { + GLSL("float averageLuma; \n"); + if (tex_is_y) { + // We already 
have the luma channel as part of the pre-sampled color + for (int i = 0; i < 3; i++) { + if (channel_map(i, params) == PL_CHANNEL_Y) { + GLSL("averageLuma = color["$"]; \n", SH_INT(i)); + break; + } + } + } else { + // Luma channel not present in image, attach it separately + pl_assert(params->luma_tex); + ident_t luma = sh_desc(sh, (struct pl_shader_desc) { + .binding.object = params->luma_tex, + .desc = (struct pl_desc) { + .name = "luma", + .type = PL_DESC_SAMPLED_TEX, + }, + }); + + GLSL("pos = global_id * uvec2(%du, %du); \n" + "averageLuma = texelFetch("$", ivec2(pos), 0)["$"]; \n" + "averageLuma *= "$"; \n", + 1 << sub_x, 1 << sub_y, + luma, SH_INT(params->luma_comp), + tex_scale); + } + } + + ident_t grain_min = SH_FLOAT(scale.grain_min * scale.grain_scale); + ident_t grain_max = SH_FLOAT(scale.grain_max * scale.grain_scale); + + for (int i = 0; i < params->components; i++) { + enum pl_channel c = channel_map(i, params); + if (c == PL_CHANNEL_NONE) + continue; + if (!scaling[c]) + continue; + + sample(sh, OFFSET_N, lut[c], idx[c], sub_x, sub_y); + GLSL("grain = val; \n"); + + if (data->overlap) { + const char *weights[] = { "vec2(27.0, 17.0)", "vec2(23.0, 22.0)" }; + + // X-direction overlapping + GLSL("if (block_id.x > 0u && local_id.x < %du) { \n" + "vec2 w = %s / 32.0; \n" + "if (local_id.x == 1u) w.xy = w.yx; \n", + 2 >> sub_x, weights[sub_x]); + sample(sh, OFFSET_L, lut[c], idx[c], sub_x, sub_y); + GLSL("grain = dot(vec2(val, grain), w); \n" + "} \n"); + + // Y-direction overlapping + GLSL("if (block_id.y > 0u && local_id.y < %du) { \n" + "vec2 w = %s / 32.0; \n" + "if (local_id.y == 1u) w.xy = w.yx; \n", + 2 >> sub_y, weights[sub_y]); + + // We need to special-case the top left pixels since these need to + // pre-blend the top-left offset block before blending vertically + GLSL(" if (block_id.x > 0u && local_id.x < %du) {\n" + " vec2 w2 = %s / 32.0; \n" + " if (local_id.x == 1u) w2.xy = w2.yx; \n", + 2 >> sub_x, weights[sub_x]); + sample(sh, 
OFFSET_TL, lut[c], idx[c], sub_x, sub_y); + GLSL(" float tmp = val; \n"); + sample(sh, OFFSET_T, lut[c], idx[c], sub_x, sub_y); + GLSL(" val = dot(vec2(tmp, val), w2); \n" + " } else { \n"); + sample(sh, OFFSET_T, lut[c], idx[c], sub_x, sub_y); + GLSL(" } \n" + "grain = dot(vec2(val, grain), w); \n" + "} \n"); + + // Correctly clip the interpolated grain + GLSL("grain = clamp(grain, "$", "$"); \n", grain_min, grain_max); + } + + if (c == PL_CHANNEL_Y) { + GLSL("color[%d] += "$"(color[%d]) * grain; \n" + "color[%d] = clamp(color[%d], "$", "$"); \n", + i, scaling[c], i, + i, i, minValue, maxLuma); + } else { + GLSL("val = averageLuma; \n"); + if (!data->chroma_scaling_from_luma) { + // We need to load some extra variables for the mixing. Do this + // using sh_var instead of hard-coding them to avoid shader + // recompilation when these values change. + ident_t mult = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_vec2("mult"), + .data = &(float[2]){ + data->uv_mult_luma[c - 1] / 64.0, + data->uv_mult[c - 1] / 64.0, + }, + }); + + int c_offset = (unsigned) data->uv_offset[c - 1] << (bits - 8); + ident_t offset = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_float("offset"), + .data = &(float) { c_offset * scale.grain_scale }, + }); + + GLSL("val = dot(vec2(val, color[%d]), "$"); \n" + "val += "$"; \n", + i, mult, offset); + } + GLSL("color[%d] += "$"(val) * grain; \n" + "color[%d] = clamp(color[%d], "$", "$"); \n", + i, scaling[c], + i, i, minValue, maxChroma); + } + } + + GLSL("} \n"); + return true; +} diff --git a/src/shaders/film_grain_h274.c b/src/shaders/film_grain_h274.c new file mode 100644 index 0000000..6d524da --- /dev/null +++ b/src/shaders/film_grain_h274.c @@ -0,0 +1,815 @@ +/* + * This file is part of libplacebo. 
+ * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "shaders.h" +#include "shaders/film_grain.h" + +static const int8_t Gaussian_LUT[2048+4]; +static const uint32_t Seed_LUT[256]; +static const int8_t R64T[64][64]; + +static void prng_shift(uint32_t *state) +{ + // Primitive polynomial x^31 + x^3 + 1 (modulo 2) + uint32_t x = *state; + uint8_t feedback = 1u ^ (x >> 2) ^ (x >> 30); + *state = (x << 1) | (feedback & 1u); +} + + +static void generate_slice(float *out, size_t out_width, uint8_t h, uint8_t v, + int8_t grain[64][64], int16_t tmp[64][64]) +{ + const uint8_t freq_h = ((h + 3) << 2) - 1; + const uint8_t freq_v = ((v + 3) << 2) - 1; + uint32_t seed = Seed_LUT[h + v * 13]; + + // Initialize with random gaussian values, using the output array as a + // temporary buffer for these intermediate values. 
+ // + // Note: To make the subsequent matrix multiplication cache friendlier, we + // store each *column* of the starting image in a *row* of `grain` + for (int y = 0; y <= freq_v; y++) { + for (int x = 0; x <= freq_h; x += 4) { + uint16_t offset = seed % 2048; + grain[x + 0][y] = Gaussian_LUT[offset + 0]; + grain[x + 1][y] = Gaussian_LUT[offset + 1]; + grain[x + 2][y] = Gaussian_LUT[offset + 2]; + grain[x + 3][y] = Gaussian_LUT[offset + 3]; + prng_shift(&seed); + } + } + + grain[0][0] = 0; + + // 64x64 inverse integer transform + for (int y = 0; y < 64; y++) { + for (int x = 0; x <= freq_h; x++) { + int32_t sum = 0; + for (int p = 0; p <= freq_v; p++) + sum += R64T[y][p] * grain[x][p]; + tmp[y][x] = (sum + 128) >> 8; + } + } + + for (int y = 0; y < 64; y++) { + for (int x = 0; x < 64; x++) { + int32_t sum = 0; + for (int p = 0; p <= freq_h; p++) + sum += tmp[y][p] * R64T[x][p]; // R64T^T = R64 + sum = (sum + 128) >> 8; + grain[y][x] = PL_CLAMP(sum, -127, 127); + } + } + + static const uint8_t deblock_factors[13] = { + 64, 71, 77, 84, 90, 96, 103, 109, 116, 122, 128, 128, 128 + }; + + // Deblock horizontal edges by simple attentuation of values + const uint8_t deblock_coeff = deblock_factors[v]; + for (int y = 0; y < 64; y++) { + switch (y % 8) { + case 0: case 7: + // Deblock + for (int x = 0; x < 64; x++) + out[x] = ((grain[y][x] * deblock_coeff) >> 7) / 255.0; + break; + + case 1: case 2: + case 3: case 4: + case 5: case 6: + // No deblock + for (int x = 0; x < 64; x++) + out[x] = grain[y][x] / 255.0; + break; + + default: pl_unreachable(); + } + + out += out_width; + } +} + +static void fill_grain_lut(void *data, const struct sh_lut_params *params) +{ + struct { + int8_t grain[64][64]; + int16_t tmp[64][64]; + } *tmp = pl_alloc_ptr(NULL, tmp); + + float *out = data; + assert(params->var_type == PL_VAR_FLOAT); + + for (int h = 0; h < 13; h++) { + for (int v = 0; v < 13; v++) { + float *slice = out + (h * 64) * params->width + (v * 64); + generate_slice(slice, 
params->width, h, v, tmp->grain, tmp->tmp); + } + } + + pl_free(tmp); +} + +bool pl_needs_fg_h274(const struct pl_film_grain_params *params) +{ + const struct pl_h274_grain_data *data = ¶ms->data.params.h274; + if (data->model_id != 0) + return false; + + for (int i = 0; i < 3; i++) { + enum pl_channel channel = channel_map(i, params); + if (channel < 0 || channel >= 3) + continue; + if (data->component_model_present[channel]) + return true; + } + + return false; +} + +bool pl_shader_fg_h274(pl_shader sh, pl_shader_obj *grain_state, + const struct pl_film_grain_params *params) +{ + if (!sh_require(sh, PL_SHADER_SIG_NONE, params->tex->params.w, params->tex->params.h)) + return false; + + size_t shmem_req = 0; + ident_t group_sum = NULL_IDENT; + + const struct pl_glsl_version glsl = sh_glsl(sh); + if (glsl.subgroup_size < 8*8) { + group_sum = sh_fresh(sh, "group_sum"); + shmem_req += sizeof(int); + GLSLH("shared int "$"; \n", group_sum); + GLSL($" = 0; barrier(); \n", group_sum); + } + + if (!sh_try_compute(sh, 8, 8, false, shmem_req)) { + SH_FAIL(sh, "H.274 film grain synthesis requires compute shaders!"); + return false; + } + + ident_t db = sh_lut(sh, sh_lut_params( + .object = grain_state, + .var_type = PL_VAR_FLOAT, + .lut_type = SH_LUT_TEXTURE, + .width = 13 * 64, + .height = 13 * 64, + .comps = 1, + .fill = fill_grain_lut, + .signature = CACHE_KEY_H274, // doesn't depend on anything + .cache = SH_CACHE(sh), + )); + + sh_describe(sh, "H.274 film grain"); + GLSL("vec4 color; \n" + "// pl_shader_film_grain (H.274) \n" + "{ \n"); + + // Load the color value of the tex itself + ident_t tex = sh_desc(sh, (struct pl_shader_desc) { + .binding.object = params->tex, + .desc = (struct pl_desc) { + .name = "tex", + .type = PL_DESC_SAMPLED_TEX, + }, + }); + + GLSL("ivec2 pos = ivec2(gl_GlobalInvocationID); \n" + "color = vec4("$") * texelFetch("$", pos, 0); \n", + SH_FLOAT(pl_color_repr_normalize(params->repr)), tex); + + const struct pl_h274_grain_data *data = 
¶ms->data.params.h274; + ident_t scale_factor = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_float("scale_factor"), + .data = &(float){ 1.0 / (1 << (data->log2_scale_factor + 6)) }, + }); + + // pcg3d (http://www.jcgt.org/published/0009/03/02/) + GLSL("uvec3 pcg = uvec3("$", gl_WorkGroupID.xy / 2u); \n" + "pcg = pcg * 1664525u + 1013904223u; \n" + "pcg.x += pcg.y * pcg.z; \n" + "pcg.y += pcg.z * pcg.x; \n" + "pcg.z += pcg.x * pcg.y; \n" + "pcg ^= pcg >> 16u; \n" + "pcg.x += pcg.y * pcg.z; \n" + "pcg.y += pcg.z * pcg.x; \n" + "pcg.z += pcg.x * pcg.y; \n", + sh_var(sh, (struct pl_shader_var) { + .var = pl_var_uint("seed"), + .data = &(unsigned int){ params->data.seed }, + })); + + for (int idx = 0; idx < params->components; idx++) { + enum pl_channel c = channel_map(idx, params); + if (c == PL_CHANNEL_NONE) + continue; + if (!data->component_model_present[c]) + continue; + + GLSL("// component %d\n{\n", c); + + // Compute the local 8x8 average + GLSL("float avg = color[%d] / 64.0; \n", c); + + const int precision = 10000000; + if (glsl.subgroup_size) { + GLSL("avg = subgroupAdd(avg); \n"); + + if (glsl.subgroup_size < 8*8) { + GLSL("if (subgroupElect()) \n" + " atomicAdd("$", int(avg * %d.0)); \n" + "barrier(); \n" + "avg = float("$") / %d.0; \n", + group_sum, precision, group_sum, precision); + } + } else { + GLSL("atomicAdd("$", int(avg * %d.0)); \n" + "barrier(); \n" + "avg = float("$") / %d.0; \n", + group_sum, precision, group_sum, precision); + } + + // Hard-coded unrolled loop, to avoid having to load a dynamically + // sized array into the shader - and to optimize for the very common + // case of there only being a single intensity interval + GLSL("uint val; \n"); + for (int i = 0; i < data->num_intensity_intervals[c]; i++) { + ident_t bounds = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_vec2("bounds"), + .data = &(float[2]) { + data->intensity_interval_lower_bound[c][i] / 255.0, + data->intensity_interval_upper_bound[c][i] / 255.0, + }, + }); 
+ + const uint8_t num_values = data->num_model_values[c]; + uint8_t h = num_values > 1 ? data->comp_model_value[c][i][1] : 8; + uint8_t v = num_values > 2 ? data->comp_model_value[c][i][2] : h; + h = PL_CLAMP(h, 2, 14) - 2; + v = PL_CLAMP(v, 2, 14) - 2; + // FIXME: double h/v for subsampled planes! + + // Reduce scale for chroma planes + int16_t scale = data->comp_model_value[c][i][0]; + if (c > 0 && pl_color_system_is_ycbcr_like(params->repr->sys)) + scale >>= 1; + + pl_static_assert(sizeof(unsigned int) >= sizeof(uint32_t)); + ident_t values = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_uint("comp_model_value"), + .data = &(unsigned int) { + (uint16_t) scale << 16 | h << 8 | v, + }, + }); + + GLSL("if (avg >= "$".x && avg <= "$".y) \n" + " val = "$"; else \n", + bounds, bounds, values); + } + GLSL(" val = 0u; \n"); + + // Extract the grain parameters from comp_model_value + GLSL("uvec2 offset = uvec2((val & 0xFF00u) >> 2, \n" + " (val & 0xFFu) << 6); \n" + "float scale = "$" * float(int(val >> 16)); \n" + // Add randomness + "uint rand = pcg[%d]; \n" + "offset.x += (rand >> 16u) %% 52u; \n" + "offset.y += (rand & 0xFFFFu) %% 56u; \n" + "offset.x &= 0xFFFCu; \n" + "offset.y &= 0xFFF8u; \n" + "if ((rand & 1u) == 1u) scale = -scale; \n" + // Add local offset and compute grain + "offset += 8u * (gl_WorkGroupID.xy %% 2u); \n" + "offset += gl_LocalInvocationID.xy; \n" + "float grain = "$"(offset); \n" + "color[%d] += scale * grain; \n", + scale_factor, c, db, c); + + // TODO: Deblocking? 
+ + GLSL("}\n"); + } + + GLSL("} \n"); + return true; +} + +// These tables are all taken from the SMPTE RDD 5-2006 specification +static const int8_t Gaussian_LUT[2048+4] = { + -11, 12, 103, -11, 42, -35, 12, 59, 77, 98, -87, 3, 65, -78, 45, 56, -51, 21, + 13, -11, -20, -19, 33, -127, 17, -6, -105, 18, 19, 71, 48, -10, -38, 42, + -2, 75, -67, 52, -90, 33, -47, 21, -3, -56, 49, 1, -57, -42, -1, 120, -127, + -108, -49, 9, 14, 127, 122, 109, 52, 127, 2, 7, 114, 19, 30, 12, 77, 112, + 82, -61, -127, 111, -52, -29, 2, -49, -24, 58, -29, -73, 12, 112, 67, 79, + -3, -114, -87, -6, -5, 40, 58, -81, 49, -27, -31, -34, -105, 50, 16, -24, + -35, -14, -15, -127, -55, -22, -55, -127, -112, 5, -26, -72, 127, 127, -2, + 41, 87, -65, -16, 55, 19, 91, -81, -65, -64, 35, -7, -54, 99, -7, 88, 125, + -26, 91, 0, 63, 60, -14, -23, 113, -33, 116, 14, 26, 51, -16, 107, -8, 53, + 38, -34, 17, -7, 4, -91, 6, 63, 63, -15, 39, -36, 19, 55, 17, -51, 40, 33, + -37, 126, -39, -118, 17, -30, 0, 19, 98, 60, 101, -12, -73, -17, -52, 98, + 3, 3, 60, 33, -3, -2, 10, -42, -106, -38, 14, 127, 16, -127, -31, -86, -39, + -56, 46, -41, 75, 23, -19, -22, -70, 74, -54, -2, 32, -45, 17, -92, 59, + -64, -67, 56, -102, -29, -87, -34, -92, 68, 5, -74, -61, 93, -43, 14, -26, + -38, -126, -17, 16, -127, 64, 34, 31, 93, 17, -51, -59, 71, 77, 81, 127, + 127, 61, 33, -106, -93, 0, 0, 75, -69, 71, 127, -19, -111, 30, 23, 15, 2, + 39, 92, 5, 42, 2, -6, 38, 15, 114, -30, -37, 50, 44, 106, 27, 119, 7, -80, + 25, -68, -21, 92, -11, -1, 18, 41, -50, 79, -127, -43, 127, 18, 11, -21, + 32, -52, 27, -88, -90, -39, -19, -10, 24, -118, 72, -24, -44, 2, 12, 86, + -107, 39, -33, -127, 47, 51, -24, -22, 46, 0, 15, -35, -69, -2, -74, 24, + -6, 0, 29, -3, 45, 32, -32, 117, -45, 79, -24, -17, -109, -10, -70, 88, + -48, 24, -91, 120, -37, 50, -127, 58, 32, -82, -10, -17, -7, 46, -127, -15, + 89, 127, 17, 98, -39, -33, 37, 42, -40, -32, -21, 105, -19, 19, 19, -59, + -9, 30, 0, -127, 34, 127, -84, 75, 24, -40, -49, -127, -107, -14, 
45, -75, + 1, 30, -20, 41, -68, -40, 12, 127, -3, 5, 20, -73, -59, -127, -3, -3, -53, + -6, -119, 93, 120, -80, -50, 0, 20, -46, 67, 78, -12, -22, -127, 36, -41, + 56, 119, -5, -116, -22, 68, -14, -90, 24, -82, -44, -127, 107, -25, -37, + 40, -7, -7, -82, 5, -87, 44, -34, 9, -127, 39, 70, 49, -63, 74, -49, 109, + -27, -89, -47, -39, 44, 49, -4, 60, -42, 80, 9, -127, -9, -56, -49, 125, + -66, 47, 36, 117, 15, -11, -96, 109, 94, -17, -56, 70, 8, -14, -5, 50, 37, + -45, 120, -30, -76, 40, -46, 6, 3, 69, 17, -78, 1, -79, 6, 127, 43, 26, + 127, -127, 28, -55, -26, 55, 112, 48, 107, -1, -77, -1, 53, -9, -22, -43, + 123, 108, 127, 102, 68, 46, 5, 1, 123, -13, -55, -34, -49, 89, 65, -105, + -5, 94, -53, 62, 45, 30, 46, 18, -35, 15, 41, 47, -98, -24, 94, -75, 127, + -114, 127, -68, 1, -17, 51, -95, 47, 12, 34, -45, -75, 89, -107, -9, -58, + -29, -109, -24, 127, -61, -13, 77, -45, 17, 19, 83, -24, 9, 127, -66, 54, + 4, 26, 13, 111, 43, -113, -22, 10, -24, 83, 67, -14, 75, -123, 59, 127, + -12, 99, -19, 64, -38, 54, 9, 7, 61, -56, 3, -57, 113, -104, -59, 3, -9, + -47, 74, 85, -55, -34, 12, 118, 28, 93, -72, 13, -99, -72, -20, 30, 72, + -94, 19, -54, 64, -12, -63, -25, 65, 72, -10, 127, 0, -127, 103, -20, -73, + -112, -103, -6, 28, -42, -21, -59, -29, -26, 19, -4, -51, 94, -58, -95, + -37, 35, 20, -69, 127, -19, -127, -22, -120, -53, 37, 74, -127, -1, -12, + -119, -53, -28, 38, 69, 17, 16, -114, 89, 62, 24, 37, -23, 49, -101, -32, + -9, -95, -53, 5, 93, -23, -49, -8, 51, 3, -75, -90, -10, -39, 127, -86, + -22, 20, 20, 113, 75, 52, -31, 92, -63, 7, -12, 46, 36, 101, -43, -17, -53, + -7, -38, -76, -31, -21, 62, 31, 62, 20, -127, 31, 64, 36, 102, -85, -10, + 77, 80, 58, -79, -8, 35, 8, 80, -24, -9, 3, -17, 72, 127, 83, -87, 55, 18, + -119, -123, 36, 10, 127, 56, -55, 113, 13, 26, 32, -13, -48, 22, -13, 5, + 58, 27, 24, 26, -11, -36, 37, -92, 78, 81, 9, 51, 14, 67, -13, 0, 32, 45, + -76, 32, -39, -22, -49, -127, -27, 31, -9, 36, 14, 71, 13, 57, 12, -53, + -86, 53, -44, -35, 2, 
127, 12, -66, -44, 46, -115, 3, 10, 56, -35, 119, + -19, -61, 52, -59, -127, -49, -23, 4, -5, 17, -82, -6, 127, 25, 79, 67, 64, + -25, 14, -64, -37, -127, -28, 21, -63, 66, -53, -41, 109, -62, 15, -22, 13, + 29, -63, 20, 27, 95, -44, -59, -116, -10, 79, -49, 22, -43, -16, 46, -47, + -120, -36, -29, -52, -44, 29, 127, -13, 49, -9, -127, 75, -28, -23, 88, 59, + 11, -95, 81, -59, 58, 60, -26, 40, -92, -3, -22, -58, -45, -59, -22, -53, + 71, -29, 66, -32, -23, 14, -17, -66, -24, -28, -62, 47, 38, 17, 16, -37, + -24, -11, 8, -27, -19, 59, 45, -49, -47, -4, -22, -81, 30, -67, -127, 74, + 102, 5, -18, 98, 34, -66, 42, -52, 7, -59, 24, -58, -19, -24, -118, -73, + 91, 15, -16, 79, -32, -79, -127, -36, 41, 77, -83, 2, 56, 22, -75, 127, + -16, -21, 12, 31, 56, -113, -127, 90, 55, 61, 12, 55, -14, -113, -14, 32, + 49, -67, -17, 91, -10, 1, 21, 69, -70, 99, -19, -112, 66, -90, -10, -9, + -71, 127, 50, -81, -49, 24, 61, -61, -111, 7, -41, 127, 88, -66, 108, -127, + -6, 36, -14, 41, -50, 14, 14, 73, -101, -28, 77, 127, -8, -100, 88, 38, + 121, 88, -125, -60, 13, -94, -115, 20, -67, -87, -94, -119, 44, -28, -30, + 18, 5, -53, -61, 20, -43, 11, -77, -60, 13, 29, 3, 6, -72, 38, -60, -11, + 108, -53, 41, 66, -12, -127, -127, -49, 24, 29, 46, 36, 91, 34, -33, 116, + -51, -34, -52, 91, 7, -83, 73, -26, -103, 24, -10, 76, 84, 5, 68, -80, -13, + -17, -32, -48, 20, 50, 26, 10, 63, -104, -14, 37, 127, 114, 97, 35, 1, -33, + -55, 127, -124, -33, 61, -7, 119, -32, -127, -53, -42, 63, 3, -5, -26, 70, + -58, -33, -44, -43, 34, -56, -127, 127, 25, -35, -11, 16, -81, 29, -58, 40, + -127, -127, 20, -47, -11, -36, -63, -52, -32, -82, 78, -76, -73, 8, 27, + -72, -9, -74, -85, -86, -57, 25, 78, -10, -97, 35, -65, 8, -59, 14, 1, -42, + 32, -88, -44, 17, -3, -9, 59, 40, 12, -108, -40, 24, 34, 18, -28, 2, 51, + -110, -4, 100, 1, 65, 22, 0, 127, 61, 45, 25, -31, 6, 9, -7, -48, 99, 16, + 44, -2, -40, 32, -39, -52, 10, -110, -19, 56, -127, 69, 26, 51, 92, 40, 61, + -52, 45, -38, 13, 85, 122, 27, 66, 45, 
-111, -83, -3, 31, 37, 19, -36, 58, + 71, 39, -78, -47, 58, -78, 8, -62, -36, -14, 61, 42, -127, 71, -4, 24, -54, + 52, -127, 67, -4, -42, 30, -63, 59, -3, -1, -18, -46, -92, -81, -96, -14, + -53, -10, -11, -77, 13, 1, 8, -67, -127, 127, -28, 26, -14, 18, -13, -26, + 2, 10, -46, -32, -15, 27, -31, -59, 59, 77, -121, 28, 40, -54, -62, -31, + -21, -37, -32, -6, -127, -25, -60, 70, -127, 112, -127, 127, 88, -7, 116, + 110, 53, 87, -127, 3, 16, 23, 74, -106, -51, 3, 74, -82, -112, -74, 65, 81, + 25, 53, 127, -45, -50, -103, -41, -65, -29, 79, -67, 64, -33, -30, -8, 127, + 0, -13, -51, 67, -14, 5, -92, 29, -35, -8, -90, -57, -3, 36, 43, 44, -31, + -69, -7, 36, 39, -51, 43, -81, 58, 6, 127, 12, 57, 66, 46, 59, -43, -42, + 41, -15, -120, 24, 3, -11, 19, -13, 51, 28, 3, 55, -48, -12, -1, 2, 97, + -19, 29, 42, 13, 43, 78, -44, 56, -108, -43, -19, 127, 15, -11, -18, -81, + 83, -37, 77, -109, 15, 65, -50, 43, 12, 13, 27, 28, 61, 57, 30, 26, 106, + -18, 56, 13, 97, 4, -8, -62, -103, 94, 108, -44, 52, 27, -47, -9, 105, -53, + 46, 89, 103, -33, 38, -34, 55, 51, 70, -94, -35, -87, -107, -19, -31, 9, + -19, 79, -14, 77, 5, -19, -107, 85, 21, -45, -39, -42, 9, -29, 74, 47, -75, + 60, -127, 120, -112, -57, -32, 41, 7, 79, 76, 66, 57, 41, -25, 31, 37, -47, + -36, 43, -73, -37, 63, 127, -69, -52, 90, -33, -61, 60, -55, 44, 15, 4, + -67, 13, -92, 64, 29, -39, -3, 83, -2, -38, -85, -86, 58, 35, -69, -61, 29, + -37, -95, -78, 4, 30, -4, -32, -80, -22, -9, -77, 46, 7, -93, -71, 65, 9, + -50, 127, -70, 26, -12, -39, -114, 63, -127, -100, 4, -32, 111, 22, -60, + 65, -101, 26, -42, 21, -59, -27, -74, 2, -94, 6, 126, 5, 76, -88, -9, -43, + -101, 127, 1, 125, 92, -63, 52, 56, 4, 81, -127, 127, 80, 127, -29, 30, + 116, -74, -17, -57, 105, 48, 45, 25, -72, 48, -38, -108, 31, -34, 4, -11, + 41, -127, 52, -104, -43, -37, 52, 2, 47, 87, -9, 77, 27, -41, -25, 90, 86, + -56, 75, 10, 33, 78, 58, 127, 127, -7, -73, 49, -33, -106, -35, 38, 57, 53, + -17, -4, 83, 52, -108, 54, -125, 28, 23, 56, -43, -88, 
-17, -6, 47, 23, -9, + 0, -13, 111, 75, 27, -52, -38, -34, 39, 30, 66, 39, 38, -64, 38, 3, 21, + -32, -51, -28, 54, -38, -87, 20, 52, 115, 18, -81, -70, 0, -14, -46, -46, + -3, 125, 16, -14, 23, -82, -84, -69, -20, -65, -127, 9, 81, -49, 61, 7, + -36, -45, -42, 57, -26, 47, 20, -85, 46, -13, 41, -37, -75, -60, 86, -78, + -127, 12, 50, 2, -3, 13, 47, 5, 19, -78, -55, -27, 65, -71, 12, -108, 20, + -16, 11, -31, 63, -55, 37, 75, -17, 127, -73, -33, -28, -120, 105, 68, 106, + -103, -106, 71, 61, 2, 23, -3, 33, -5, -15, -67, -15, -23, -54, 15, -63, + 76, 58, -110, 1, 83, -27, 22, 75, -39, -17, -11, 64, -17, -127, -54, -66, + 31, 96, 116, 3, -114, -7, -108, -63, 97, 9, 50, 8, 75, -28, 72, 112, -36, + -112, 95, -50, 23, -13, -19, 55, 21, 23, 92, 91, 22, -49, 16, -75, 23, 9, + -49, -97, -37, 49, -36, 36, -127, -86, 43, 127, -24, -24, 84, 83, -35, -34, + -12, 109, 102, -38, 51, -68, 34, 19, -22, 49, -32, 127, 40, 24, -93, -4, + -3, 105, 3, -58, -18, 8, 127, -18, 125, 68, 69, -62, 30, -36, 54, -57, -24, + 17, 43, -36, -27, -57, -67, -21, -10, -49, 68, 12, 65, 4, 48, 55, 127, -75, + 44, 89, -66, -13, -78, -82, -91, 22, 30, 33, -40, -87, -34, 96, -91, 39, + 10, -64, -3, -12, 127, -50, -37, -56, 23, -35, -36, -54, 90, -91, 2, 50, + 77, -6, -127, 16, 46, -5, -73, 0, -56, -18, -72, 28, 93, 60, 49, 20, 18, + 111, -111, 32, -83, 47, 47, -10, 35, -88, 43, 57, -98, 127, -17, 0, 1, -39, + -127, -2, 0, 63, 93, 0, 36, -66, -61, -19, 39, -127, 58, 50, -17, 127, 88, + -43, -108, -51, -16, 7, -36, 68, 46, -14, 107, 40, 57, 7, 19, 8, 3, 88, + -90, -92, -18, -21, -24, 13, 7, -4, -78, -91, -4, 8, -35, -5, 19, 2, -111, + 4, -66, -81, 122, -20, -34, -37, -84, 127, 68, 46, 17, 47, + + // Repeat the beginning of the array to allow wrapping reads + -11, 12, 103, -11, +}; + +static const uint32_t Seed_LUT[256] = { + 747538460, 1088979410, 1744950180, 1767011913, 1403382928, + 521866116, 1060417601, 2110622736, 1557184770, 105289385, 585624216, + 1827676546, 1191843873, 1018104344, 1123590530, 
663361569, 2023850500, + 76561770, 1226763489, 80325252, 1992581442, 502705249, 740409860, + 516219202, 557974537, 1883843076, 720112066, 1640137737, 1820967556, + 40667586, 155354121, 1820967557, 1115949072, 1631803309, 98284748, + 287433856, 2119719977, 988742797, 1827432592, 579378475, 1017745956, + 1309377032, 1316535465, 2074315269, 1923385360, 209722667, 1546228260, + 168102420, 135274561, 355958469, 248291472, 2127839491, 146920100, + 585982612, 1611702337, 696506029, 1386498192, 1258072451, 1212240548, + 1043171860, 1217404993, 1090770605, 1386498193, 169093201, 541098240, + 1468005469, 456510673, 1578687785, 1838217424, 2010752065, 2089828354, + 1362717428, 970073673, 854129835, 714793201, 1266069081, 1047060864, + 1991471829, 1098097741, 913883585, 1669598224, 1337918685, 1219264706, + 1799741108, 1834116681, 683417731, 1120274457, 1073098457, 1648396544, + 176642749, 31171789, 718317889, 1266977808, 1400892508, 549749008, + 1808010512, 67112961, 1005669825, 903663673, 1771104465, 1277749632, + 1229754427, 950632997, 1979371465, 2074373264, 305357524, 1049387408, + 1171033360, 1686114305, 2147468765, 1941195985, 117709841, 809550080, + 991480851, 1816248997, 1561503561, 329575568, 780651196, 1659144592, + 1910793616, 604016641, 1665084765, 1530186961, 1870928913, 809550081, + 2079346113, 71307521, 876663040, 1073807360, 832356664, 1573927377, + 204073344, 2026918147, 1702476788, 2043881033, 57949587, 2001393952, + 1197426649, 1186508931, 332056865, 950043140, 890043474, 349099312, + 148914948, 236204097, 2022643605, 1441981517, 498130129, 1443421481, + 924216797, 1817491777, 1913146664, 1411989632, 929068432, 495735097, + 1684636033, 1284520017, 432816184, 1344884865, 210843729, 676364544, + 234449232, 12112337, 1350619139, 1753272996, 2037118872, 1408560528, + 533334916, 1043640385, 357326099, 201376421, 110375493, 541106497, + 416159637, 242512193, 777294080, 1614872576, 1535546636, 870600145, + 910810409, 1821440209, 1605432464, 1145147393, 951695441, 
1758494976, + 1506656568, 1557150160, 608221521, 1073840384, 217672017, 684818688, + 1750138880, 16777217, 677990609, 953274371, 1770050213, 1359128393, + 1797602707, 1984616737, 1865815816, 2120835200, 2051677060, 1772234061, + 1579794881, 1652821009, 1742099468, 1887260865, 46468113, 1011925248, + 1134107920, 881643832, 1354774993, 472508800, 1892499769, 1752793472, + 1962502272, 687898625, 883538000, 1354355153, 1761673473, 944820481, + 2020102353, 22020353, 961597696, 1342242816, 964808962, 1355809701, + 17016649, 1386540177, 647682692, 1849012289, 751668241, 1557184768, + 127374604, 1927564752, 1045744913, 1614921984, 43588881, 1016185088, + 1544617984, 1090519041, 136122424, 215038417, 1563027841, 2026918145, + 1688778833, 701530369, 1372639488, 1342242817, 2036945104, 953274369, + 1750192384, 16842753, 964808960, 1359020032, 1358954497 +}; + +// Note: This is pre-transposed, i.e. stored column-major order +static const int8_t R64T[64][64] = { + { + 32, 45, 45, 45, 45, 45, 45, 45, 44, 44, 44, 44, 43, 43, 43, 42, + 42, 41, 41, 40, 40, 39, 39, 38, 38, 37, 36, 36, 35, 34, 34, 33, + 32, 31, 30, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, + 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 4, 3, 2, 1, + }, { + 32, 45, 45, 44, 43, 42, 41, 39, 38, 36, 34, 31, 29, 26, 23, 20, + 17, 14, 11, 8, 4, 1, -2, -6, -9, -12, -15, -18, -21, -24, -27, -30, + -32, -34, -36, -38, -40, -41, -43, -44, -44, -45, -45, -45, -45, -45, -44, -43, + -42, -40, -39, -37, -35, -33, -30, -28, -25, -22, -19, -16, -13, -10, -7, -3, + }, { + 32, 45, 44, 42, 40, 37, 34, 30, 25, 20, 15, 10, 4, -1, -7, -12, + -17, -22, -27, -31, -35, -38, -41, -43, -44, -45, -45, -45, -43, -41, -39, -36, + -32, -28, -23, -18, -13, -8, -2, 3, 9, 14, 19, 24, 29, 33, 36, 39, + 42, 44, 45, 45, 45, 44, 43, 40, 38, 34, 30, 26, 21, 16, 11, 6, + }, { + 32, 45, 43, 39, 35, 30, 23, 16, 9, 1, -7, -14, -21, -28, -34, -38, + -42, -44, -45, -45, -43, -40, -36, -31, -25, -18, -11, -3, 4, 12, 19, 26, + 32, 37, 41, 44, 45, 45, 44, 
41, 38, 33, 27, 20, 13, 6, -2, -10, + -17, -24, -30, -36, -40, -43, -45, -45, -44, -42, -39, -34, -29, -22, -15, -8, + }, { + 32, 44, 41, 36, 29, 20, 11, 1, -9, -18, -27, -34, -40, -44, -45, -45, + -42, -37, -30, -22, -13, -3, 7, 16, 25, 33, 39, 43, 45, 45, 43, 38, + 32, 24, 15, 6, -4, -14, -23, -31, -38, -42, -45, -45, -43, -39, -34, -26, + -17, -8, 2, 12, 21, 30, 36, 41, 44, 45, 44, 40, 35, 28, 19, 10, + }, { + 32, 44, 39, 31, 21, 10, -2, -14, -25, -34, -41, -45, -45, -42, -36, -28, + -17, -6, 7, 18, 29, 37, 43, 45, 44, 40, 34, 24, 13, 1, -11, -22, + -32, -39, -44, -45, -43, -38, -30, -20, -9, 3, 15, 26, 35, 41, 45, 45, + 42, 36, 27, 16, 4, -8, -19, -30, -38, -43, -45, -44, -40, -33, -23, -12, + }, { + 32, 43, 36, 26, 13, -1, -15, -28, -38, -44, -45, -42, -35, -24, -11, 3, + 17, 30, 39, 44, 45, 41, 34, 22, 9, -6, -19, -31, -40, -45, -45, -40, + -32, -20, -7, 8, 21, 33, 41, 45, 44, 39, 30, 18, 4, -10, -23, -34, + -42, -45, -44, -38, -29, -16, -2, 12, 25, 36, 43, 45, 43, 37, 27, 14, + }, { + 32, 42, 34, 20, 4, -12, -27, -38, -44, -45, -39, -28, -13, 3, 19, 33, + 42, 45, 43, 34, 21, 6, -11, -26, -38, -44, -45, -39, -29, -14, 2, 18, + 32, 41, 45, 43, 35, 22, 7, -10, -25, -37, -44, -45, -40, -30, -15, 1, + 17, 31, 41, 45, 43, 36, 23, 8, -9, -24, -36, -44, -45, -40, -30, -16, + }, { + 32, 41, 30, 14, -4, -22, -36, -44, -44, -37, -23, -6, 13, 30, 41, 45, + 42, 31, 15, -3, -21, -36, -44, -45, -38, -24, -7, 12, 29, 40, 45, 42, + 32, 16, -2, -20, -35, -44, -45, -38, -25, -8, 11, 28, 40, 45, 43, 33, + 17, -1, -19, -34, -43, -45, -39, -26, -9, 10, 27, 39, 45, 43, 34, 18, + }, { + 32, 40, 27, 8, -13, -31, -43, -45, -38, -22, -2, 18, 35, 44, 44, 34, + 17, -3, -23, -38, -45, -42, -30, -12, 9, 28, 41, 45, 40, 26, 7, -14, + -32, -43, -45, -37, -21, -1, 19, 36, 44, 44, 34, 16, -4, -24, -39, -45, + -42, -30, -11, 10, 29, 41, 45, 39, 25, 6, -15, -33, -43, -45, -36, -20, + }, { + 32, 39, 23, 1, -21, -38, -45, -40, -25, -3, 19, 37, 45, 41, 27, 6, + -17, -36, -45, -42, -29, -8, 15, 34, 
44, 43, 30, 10, -13, -33, -44, -44, + -32, -12, 11, 31, 43, 44, 34, 14, -9, -30, -43, -45, -35, -16, 7, 28, + 42, 45, 36, 18, -4, -26, -41, -45, -38, -20, 2, 24, 40, 45, 39, 22, + }, { + 32, 38, 19, -6, -29, -43, -44, -31, -9, 16, 36, 45, 40, 22, -2, -26, + -42, -45, -34, -12, 13, 34, 45, 41, 25, 1, -23, -40, -45, -36, -15, 10, + 32, 44, 43, 28, 4, -20, -39, -45, -38, -18, 7, 30, 43, 44, 30, 8, + -17, -37, -45, -39, -21, 3, 27, 42, 44, 33, 11, -14, -35, -45, -41, -24, + }, { + 32, 37, 15, -12, -35, -45, -39, -18, 9, 33, 45, 40, 21, -6, -30, -44, + -42, -24, 2, 28, 43, 43, 27, 1, -25, -42, -44, -30, -4, 22, 41, 45, + 32, 8, -19, -39, -45, -34, -11, 16, 38, 45, 36, 14, -13, -36, -45, -38, + -17, 10, 34, 45, 40, 20, -7, -31, -44, -41, -23, 3, 29, 44, 43, 26, + }, { + 32, 36, 11, -18, -40, -45, -30, -3, 25, 43, 43, 24, -4, -31, -45, -39, + -17, 12, 36, 45, 35, 10, -19, -40, -44, -30, -2, 26, 43, 42, 23, -6, + -32, -45, -39, -16, 13, 37, 45, 34, 9, -20, -41, -44, -29, -1, 27, 44, + 42, 22, -7, -33, -45, -38, -15, 14, 38, 45, 34, 8, -21, -41, -44, -28, + }, { + 32, 34, 7, -24, -43, -41, -19, 12, 38, 45, 30, 1, -29, -45, -39, -14, + 17, 40, 44, 26, -4, -33, -45, -36, -9, 22, 43, 42, 21, -10, -36, -45, + -32, -3, 27, 44, 40, 16, -15, -39, -44, -28, 2, 31, 45, 37, 11, -20, + -42, -43, -23, 8, 35, 45, 34, 6, -25, -44, -41, -18, 13, 38, 45, 30, + }, { + 32, 33, 2, -30, -45, -36, -7, 26, 44, 38, 11, -22, -43, -40, -15, 18, + 42, 42, 19, -14, -40, -44, -23, 10, 38, 45, 27, -6, -35, -45, -30, 1, + 32, 45, 34, 3, -29, -45, -36, -8, 25, 44, 39, 12, -21, -43, -41, -16, + 17, 41, 43, 20, -13, -39, -44, -24, 9, 37, 45, 28, -4, -34, -45, -31, + }, { + 32, 31, -2, -34, -45, -28, 7, 37, 44, 24, -11, -39, -43, -20, 15, 41, + 42, 16, -19, -43, -40, -12, 23, 44, 38, 8, -27, -45, -35, -3, 30, 45, + 32, -1, -34, -45, -29, 6, 36, 45, 25, -10, -39, -44, -21, 14, 41, 42, + 17, -18, -43, -40, -13, 22, 44, 38, 9, -26, -45, -36, -4, 30, 45, 33, + }, { + 32, 30, -7, -38, -43, -18, 19, 44, 38, 6, 
-30, -45, -29, 8, 39, 43, + 17, -20, -44, -37, -4, 31, 45, 28, -9, -39, -43, -16, 21, 44, 36, 3, + -32, -45, -27, 10, 40, 42, 15, -22, -44, -36, -2, 33, 45, 26, -11, -40, + -42, -14, 23, 45, 35, 1, -34, -45, -25, 12, 41, 41, 13, -24, -45, -34, + }, { + 32, 28, -11, -41, -40, -8, 30, 45, 25, -14, -43, -38, -4, 33, 45, 22, + -17, -44, -36, -1, 35, 44, 19, -20, -44, -34, 2, 37, 43, 16, -23, -45, + -32, 6, 39, 42, 13, -26, -45, -30, 9, 40, 41, 10, -29, -45, -27, 12, + 42, 39, 7, -31, -45, -24, 15, 43, 38, 3, -34, -45, -21, 18, 44, 36, + }, { + 32, 26, -15, -44, -35, 3, 39, 41, 9, -31, -45, -20, 21, 45, 30, -10, + -42, -38, -2, 36, 43, 14, -27, -45, -25, 16, 44, 34, -4, -39, -41, -8, + 32, 45, 19, -22, -45, -30, 11, 42, 38, 1, -36, -43, -13, 28, 45, 24, + -17, -44, -34, 6, 40, 40, 7, -33, -44, -18, 23, 45, 29, -12, -43, -37, + }, { + 32, 24, -19, -45, -29, 14, 44, 33, -9, -42, -36, 3, 40, 39, 2, -37, + -42, -8, 34, 44, 13, -30, -45, -18, 25, 45, 23, -20, -45, -28, 15, 44, + 32, -10, -43, -36, 4, 40, 39, 1, -38, -41, -7, 34, 43, 12, -30, -45, + -17, 26, 45, 22, -21, -45, -27, 16, 44, 31, -11, -43, -35, 6, 41, 38, + }, { + 32, 22, -23, -45, -21, 24, 45, 20, -25, -45, -19, 26, 45, 18, -27, -45, + -17, 28, 45, 16, -29, -45, -15, 30, 44, 14, -30, -44, -13, 31, 44, 12, + -32, -44, -11, 33, 43, 10, -34, -43, -9, 34, 43, 8, -35, -42, -7, 36, + 42, 6, -36, -41, -4, 37, 41, 3, -38, -40, -2, 38, 40, 1, -39, -39, + }, { + 32, 20, -27, -45, -13, 33, 43, 6, -38, -39, 2, 41, 35, -10, -44, -30, + 17, 45, 23, -24, -45, -16, 30, 44, 9, -36, -41, -1, 40, 37, -7, -43, + -32, 14, 45, 26, -21, -45, -19, 28, 44, 12, -34, -42, -4, 38, 39, -3, + -42, -34, 11, 44, 29, -18, -45, -22, 25, 45, 15, -31, -43, -8, 36, 40, + }, { + 32, 18, -30, -43, -4, 39, 36, -10, -44, -26, 23, 45, 13, -34, -41, 1, + 42, 33, -15, -45, -21, 28, 44, 8, -38, -38, 7, 44, 29, -20, -45, -16, + 32, 42, 2, -40, -35, 12, 45, 24, -25, -45, -11, 36, 40, -3, -43, -31, + 17, 45, 19, -30, -43, -6, 39, 37, -9, -44, -27, 22, 45, 14, 
-34, -41, + }, { + 32, 16, -34, -40, 4, 44, 27, -24, -44, -8, 39, 36, -13, -45, -19, 31, + 42, -1, -43, -30, 21, 45, 11, -37, -38, 10, 45, 22, -29, -43, -2, 41, + 32, -18, -45, -14, 35, 39, -7, -44, -25, 26, 44, 6, -40, -34, 15, 45, + 17, -33, -41, 3, 43, 28, -23, -45, -9, 38, 36, -12, -45, -20, 30, 42, + }, { + 32, 14, -36, -37, 13, 45, 15, -36, -38, 12, 45, 16, -35, -38, 11, 45, + 17, -34, -39, 10, 45, 18, -34, -39, 9, 45, 19, -33, -40, 8, 45, 20, + -32, -40, 7, 45, 21, -31, -41, 6, 44, 22, -30, -41, 4, 44, 23, -30, + -42, 3, 44, 24, -29, -42, 2, 44, 25, -28, -43, 1, 43, 26, -27, -43, + }, { + 32, 12, -39, -33, 21, 44, 2, -43, -25, 30, 41, -8, -45, -16, 36, 36, + -17, -45, -7, 41, 29, -26, -43, 3, 44, 20, -34, -38, 13, 45, 11, -39, + -32, 22, 44, 1, -43, -24, 30, 40, -9, -45, -15, 37, 35, -18, -45, -6, + 42, 28, -27, -42, 4, 45, 19, -34, -38, 14, 45, 10, -40, -31, 23, 44, + }, { + 32, 10, -41, -28, 29, 40, -11, -45, -9, 41, 27, -30, -40, 12, 45, 8, + -42, -26, 30, 39, -13, -45, -7, 42, 25, -31, -39, 14, 45, 6, -43, -24, + 32, 38, -15, -45, -4, 43, 23, -33, -38, 16, 45, 3, -43, -22, 34, 37, + -17, -45, -2, 44, 21, -34, -36, 18, 44, 1, -44, -20, 35, 36, -19, -44, + }, { + 32, 8, -43, -22, 35, 34, -23, -42, 9, 45, 7, -43, -21, 36, 34, -24, + -42, 10, 45, 6, -43, -20, 36, 33, -25, -41, 11, 45, 4, -44, -19, 37, + 32, -26, -41, 12, 45, 3, -44, -18, 38, 31, -27, -40, 13, 45, 2, -44, + -17, 38, 30, -28, -40, 14, 45, 1, -44, -16, 39, 30, -29, -39, 15, 45, + }, { + 32, 6, -44, -16, 40, 26, -34, -34, 25, 40, -15, -44, 4, 45, 7, -44, + -17, 39, 27, -33, -35, 24, 41, -14, -44, 3, 45, 8, -43, -18, 39, 28, + -32, -36, 23, 41, -13, -45, 2, 45, 9, -43, -19, 38, 29, -31, -36, 22, + 42, -12, -45, 1, 45, 10, -43, -20, 38, 30, -30, -37, 21, 42, -11, -45, + }, { + 32, 3, -45, -10, 43, 16, -41, -22, 38, 28, -34, -33, 29, 37, -23, -40, + 17, 43, -11, -45, 4, 45, 2, -45, -9, 44, 15, -41, -21, 38, 27, -34, + -32, 30, 36, -24, -40, 18, 43, -12, -44, 6, 45, 1, -45, -8, 44, 14, + -42, -20, 
39, 26, -35, -31, 30, 36, -25, -39, 19, 42, -13, -44, 7, 45, + }, { + 32, 1, -45, -3, 45, 6, -45, -8, 44, 10, -44, -12, 43, 14, -43, -16, + 42, 18, -41, -20, 40, 22, -39, -24, 38, 26, -36, -28, 35, 30, -34, -31, + 32, 33, -30, -34, 29, 36, -27, -37, 25, 38, -23, -39, 21, 40, -19, -41, + 17, 42, -15, -43, 13, 44, -11, -44, 9, 45, -7, -45, 4, 45, -2, -45, + }, { + 32, -1, -45, 3, 45, -6, -45, 8, 44, -10, -44, 12, 43, -14, -43, 16, + 42, -18, -41, 20, 40, -22, -39, 24, 38, -26, -36, 28, 35, -30, -34, 31, + 32, -33, -30, 34, 29, -36, -27, 37, 25, -38, -23, 39, 21, -40, -19, 41, + 17, -42, -15, 43, 13, -44, -11, 44, 9, -45, -7, 45, 4, -45, -2, 45, + }, { + 32, -3, -45, 10, 43, -16, -41, 22, 38, -28, -34, 33, 29, -37, -23, 40, + 17, -43, -11, 45, 4, -45, 2, 45, -9, -44, 15, 41, -21, -38, 27, 34, + -32, -30, 36, 24, -40, -18, 43, 12, -44, -6, 45, -1, -45, 8, 44, -14, + -42, 20, 39, -26, -35, 31, 30, -36, -25, 39, 19, -42, -13, 44, 7, -45, + }, { + 32, -6, -44, 16, 40, -26, -34, 34, 25, -40, -15, 44, 4, -45, 7, 44, + -17, -39, 27, 33, -35, -24, 41, 14, -44, -3, 45, -8, -43, 18, 39, -28, + -32, 36, 23, -41, -13, 45, 2, -45, 9, 43, -19, -38, 29, 31, -36, -22, + 42, 12, -45, -1, 45, -10, -43, 20, 38, -30, -30, 37, 21, -42, -11, 45, + }, { + 32, -8, -43, 22, 35, -34, -23, 42, 9, -45, 7, 43, -21, -36, 34, 24, + -42, -10, 45, -6, -43, 20, 36, -33, -25, 41, 11, -45, 4, 44, -19, -37, + 32, 26, -41, -12, 45, -3, -44, 18, 38, -31, -27, 40, 13, -45, 2, 44, + -17, -38, 30, 28, -40, -14, 45, -1, -44, 16, 39, -30, -29, 39, 15, -45, + }, { + 32, -10, -41, 28, 29, -40, -11, 45, -9, -41, 27, 30, -40, -12, 45, -8, + -42, 26, 30, -39, -13, 45, -7, -42, 25, 31, -39, -14, 45, -6, -43, 24, + 32, -38, -15, 45, -4, -43, 23, 33, -38, -16, 45, -3, -43, 22, 34, -37, + -17, 45, -2, -44, 21, 34, -36, -18, 44, -1, -44, 20, 35, -36, -19, 44, + }, { + 32, -12, -39, 33, 21, -44, 2, 43, -25, -30, 41, 8, -45, 16, 36, -36, + -17, 45, -7, -41, 29, 26, -43, -3, 44, -20, -34, 38, 13, -45, 11, 39, + -32, -22, 
44, -1, -43, 24, 30, -40, -9, 45, -15, -37, 35, 18, -45, 6, + 42, -28, -27, 42, 4, -45, 19, 34, -38, -14, 45, -10, -40, 31, 23, -44, + }, { + 32, -14, -36, 37, 13, -45, 15, 36, -38, -12, 45, -16, -35, 38, 11, -45, + 17, 34, -39, -10, 45, -18, -34, 39, 9, -45, 19, 33, -40, -8, 45, -20, + -32, 40, 7, -45, 21, 31, -41, -6, 44, -22, -30, 41, 4, -44, 23, 30, + -42, -3, 44, -24, -29, 42, 2, -44, 25, 28, -43, -1, 43, -26, -27, 43, + }, { + 32, -16, -34, 40, 4, -44, 27, 24, -44, 8, 39, -36, -13, 45, -19, -31, + 42, 1, -43, 30, 21, -45, 11, 37, -38, -10, 45, -22, -29, 43, -2, -41, + 32, 18, -45, 14, 35, -39, -7, 44, -25, -26, 44, -6, -40, 34, 15, -45, + 17, 33, -41, -3, 43, -28, -23, 45, -9, -38, 36, 12, -45, 20, 30, -42, + }, { + 32, -18, -30, 43, -4, -39, 36, 10, -44, 26, 23, -45, 13, 34, -41, -1, + 42, -33, -15, 45, -21, -28, 44, -8, -38, 38, 7, -44, 29, 20, -45, 16, + 32, -42, 2, 40, -35, -12, 45, -24, -25, 45, -11, -36, 40, 3, -43, 31, + 17, -45, 19, 30, -43, 6, 39, -37, -9, 44, -27, -22, 45, -14, -34, 41, + }, { + 32, -20, -27, 45, -13, -33, 43, -6, -38, 39, 2, -41, 35, 10, -44, 30, + 17, -45, 23, 24, -45, 16, 30, -44, 9, 36, -41, 1, 40, -37, -7, 43, + -32, -14, 45, -26, -21, 45, -19, -28, 44, -12, -34, 42, -4, -38, 39, 3, + -42, 34, 11, -44, 29, 18, -45, 22, 25, -45, 15, 31, -43, 8, 36, -40, + }, { + 32, -22, -23, 45, -21, -24, 45, -20, -25, 45, -19, -26, 45, -18, -27, 45, + -17, -28, 45, -16, -29, 45, -15, -30, 44, -14, -30, 44, -13, -31, 44, -12, + -32, 44, -11, -33, 43, -10, -34, 43, -9, -34, 43, -8, -35, 42, -7, -36, + 42, -6, -36, 41, -4, -37, 41, -3, -38, 40, -2, -38, 40, -1, -39, 39, + }, { + 32, -24, -19, 45, -29, -14, 44, -33, -9, 42, -36, -3, 40, -39, 2, 37, + -42, 8, 34, -44, 13, 30, -45, 18, 25, -45, 23, 20, -45, 28, 15, -44, + 32, 10, -43, 36, 4, -40, 39, -1, -38, 41, -7, -34, 43, -12, -30, 45, + -17, -26, 45, -22, -21, 45, -27, -16, 44, -31, -11, 43, -35, -6, 41, -38, + }, { + 32, -26, -15, 44, -35, -3, 39, -41, 9, 31, -45, 20, 21, -45, 30, 10, + -42, 
38, -2, -36, 43, -14, -27, 45, -25, -16, 44, -34, -4, 39, -41, 8, + 32, -45, 19, 22, -45, 30, 11, -42, 38, -1, -36, 43, -13, -28, 45, -24, + -17, 44, -34, -6, 40, -40, 7, 33, -44, 18, 23, -45, 29, 12, -43, 37, + }, { + 32, -28, -11, 41, -40, 8, 30, -45, 25, 14, -43, 38, -4, -33, 45, -22, + -17, 44, -36, 1, 35, -44, 19, 20, -44, 34, 2, -37, 43, -16, -23, 45, + -32, -6, 39, -42, 13, 26, -45, 30, 9, -40, 41, -10, -29, 45, -27, -12, + 42, -39, 7, 31, -45, 24, 15, -43, 38, -3, -34, 45, -21, -18, 44, -36, + }, { + 32, -30, -7, 38, -43, 18, 19, -44, 38, -6, -30, 45, -29, -8, 39, -43, + 17, 20, -44, 37, -4, -31, 45, -28, -9, 39, -43, 16, 21, -44, 36, -3, + -32, 45, -27, -10, 40, -42, 15, 22, -44, 36, -2, -33, 45, -26, -11, 40, + -42, 14, 23, -45, 35, -1, -34, 45, -25, -12, 41, -41, 13, 24, -45, 34, + }, { + 32, -31, -2, 34, -45, 28, 7, -37, 44, -24, -11, 39, -43, 20, 15, -41, + 42, -16, -19, 43, -40, 12, 23, -44, 38, -8, -27, 45, -35, 3, 30, -45, + 32, 1, -34, 45, -29, -6, 36, -45, 25, 10, -39, 44, -21, -14, 41, -42, + 17, 18, -43, 40, -13, -22, 44, -38, 9, 26, -45, 36, -4, -30, 45, -33, + }, { + 32, -33, 2, 30, -45, 36, -7, -26, 44, -38, 11, 22, -43, 40, -15, -18, + 42, -42, 19, 14, -40, 44, -23, -10, 38, -45, 27, 6, -35, 45, -30, -1, + 32, -45, 34, -3, -29, 45, -36, 8, 25, -44, 39, -12, -21, 43, -41, 16, + 17, -41, 43, -20, -13, 39, -44, 24, 9, -37, 45, -28, -4, 34, -45, 31, + }, { + 32, -34, 7, 24, -43, 41, -19, -12, 38, -45, 30, -1, -29, 45, -39, 14, + 17, -40, 44, -26, -4, 33, -45, 36, -9, -22, 43, -42, 21, 10, -36, 45, + -32, 3, 27, -44, 40, -16, -15, 39, -44, 28, 2, -31, 45, -37, 11, 20, + -42, 43, -23, -8, 35, -45, 34, -6, -25, 44, -41, 18, 13, -38, 45, -30, + }, { + 32, -36, 11, 18, -40, 45, -30, 3, 25, -43, 43, -24, -4, 31, -45, 39, + -17, -12, 36, -45, 35, -10, -19, 40, -44, 30, -2, -26, 43, -42, 23, 6, + -32, 45, -39, 16, 13, -37, 45, -34, 9, 20, -41, 44, -29, 1, 27, -44, + 42, -22, -7, 33, -45, 38, -15, -14, 38, -45, 34, -8, -21, 41, -44, 28, + }, { + 32, -37, 
15, 12, -35, 45, -39, 18, 9, -33, 45, -40, 21, 6, -30, 44, + -42, 24, 2, -28, 43, -43, 27, -1, -25, 42, -44, 30, -4, -22, 41, -45, + 32, -8, -19, 39, -45, 34, -11, -16, 38, -45, 36, -14, -13, 36, -45, 38, + -17, -10, 34, -45, 40, -20, -7, 31, -44, 41, -23, -3, 29, -44, 43, -26, + }, { + 32, -38, 19, 6, -29, 43, -44, 31, -9, -16, 36, -45, 40, -22, -2, 26, + -42, 45, -34, 12, 13, -34, 45, -41, 25, -1, -23, 40, -45, 36, -15, -10, + 32, -44, 43, -28, 4, 20, -39, 45, -38, 18, 7, -30, 43, -44, 30, -8, + -17, 37, -45, 39, -21, -3, 27, -42, 44, -33, 11, 14, -35, 45, -41, 24, + }, { + 32, -39, 23, -1, -21, 38, -45, 40, -25, 3, 19, -37, 45, -41, 27, -6, + -17, 36, -45, 42, -29, 8, 15, -34, 44, -43, 30, -10, -13, 33, -44, 44, + -32, 12, 11, -31, 43, -44, 34, -14, -9, 30, -43, 45, -35, 16, 7, -28, + 42, -45, 36, -18, -4, 26, -41, 45, -38, 20, 2, -24, 40, -45, 39, -22, + }, { + 32, -40, 27, -8, -13, 31, -43, 45, -38, 22, -2, -18, 35, -44, 44, -34, + 17, 3, -23, 38, -45, 42, -30, 12, 9, -28, 41, -45, 40, -26, 7, 14, + -32, 43, -45, 37, -21, 1, 19, -36, 44, -44, 34, -16, -4, 24, -39, 45, + -42, 30, -11, -10, 29, -41, 45, -39, 25, -6, -15, 33, -43, 45, -36, 20, + }, { + 32, -41, 30, -14, -4, 22, -36, 44, -44, 37, -23, 6, 13, -30, 41, -45, + 42, -31, 15, 3, -21, 36, -44, 45, -38, 24, -7, -12, 29, -40, 45, -42, + 32, -16, -2, 20, -35, 44, -45, 38, -25, 8, 11, -28, 40, -45, 43, -33, + 17, 1, -19, 34, -43, 45, -39, 26, -9, -10, 27, -39, 45, -43, 34, -18, + }, { + 32, -42, 34, -20, 4, 12, -27, 38, -44, 45, -39, 28, -13, -3, 19, -33, + 42, -45, 43, -34, 21, -6, -11, 26, -38, 44, -45, 39, -29, 14, 2, -18, + 32, -41, 45, -43, 35, -22, 7, 10, -25, 37, -44, 45, -40, 30, -15, -1, + 17, -31, 41, -45, 43, -36, 23, -8, -9, 24, -36, 44, -45, 40, -30, 16, + }, { + 32, -43, 36, -26, 13, 1, -15, 28, -38, 44, -45, 42, -35, 24, -11, -3, + 17, -30, 39, -44, 45, -41, 34, -22, 9, 6, -19, 31, -40, 45, -45, 40, + -32, 20, -7, -8, 21, -33, 41, -45, 44, -39, 30, -18, 4, 10, -23, 34, + -42, 45, -44, 38, -29, 
16, -2, -12, 25, -36, 43, -45, 43, -37, 27, -14, + }, { + 32, -44, 39, -31, 21, -10, -2, 14, -25, 34, -41, 45, -45, 42, -36, 28, + -17, 6, 7, -18, 29, -37, 43, -45, 44, -40, 34, -24, 13, -1, -11, 22, + -32, 39, -44, 45, -43, 38, -30, 20, -9, -3, 15, -26, 35, -41, 45, -45, + 42, -36, 27, -16, 4, 8, -19, 30, -38, 43, -45, 44, -40, 33, -23, 12, + }, { + 32, -44, 41, -36, 29, -20, 11, -1, -9, 18, -27, 34, -40, 44, -45, 45, + -42, 37, -30, 22, -13, 3, 7, -16, 25, -33, 39, -43, 45, -45, 43, -38, + 32, -24, 15, -6, -4, 14, -23, 31, -38, 42, -45, 45, -43, 39, -34, 26, + -17, 8, 2, -12, 21, -30, 36, -41, 44, -45, 44, -40, 35, -28, 19, -10, + }, { + 32, -45, 43, -39, 35, -30, 23, -16, 9, -1, -7, 14, -21, 28, -34, 38, + -42, 44, -45, 45, -43, 40, -36, 31, -25, 18, -11, 3, 4, -12, 19, -26, + 32, -37, 41, -44, 45, -45, 44, -41, 38, -33, 27, -20, 13, -6, -2, 10, + -17, 24, -30, 36, -40, 43, -45, 45, -44, 42, -39, 34, -29, 22, -15, 8, + }, { + 32, -45, 44, -42, 40, -37, 34, -30, 25, -20, 15, -10, 4, 1, -7, 12, + -17, 22, -27, 31, -35, 38, -41, 43, -44, 45, -45, 45, -43, 41, -39, 36, + -32, 28, -23, 18, -13, 8, -2, -3, 9, -14, 19, -24, 29, -33, 36, -39, + 42, -44, 45, -45, 45, -44, 43, -40, 38, -34, 30, -26, 21, -16, 11, -6, + }, { + 32, -45, 45, -44, 43, -42, 41, -39, 38, -36, 34, -31, 29, -26, 23, -20, + 17, -14, 11, -8, 4, -1, -2, 6, -9, 12, -15, 18, -21, 24, -27, 30, + -32, 34, -36, 38, -40, 41, -43, 44, -44, 45, -45, 45, -45, 45, -44, 43, + -42, 40, -39, 37, -35, 33, -30, 28, -25, 22, -19, 16, -13, 10, -7, 3, + }, { + 32, -45, 45, -45, 45, -45, 45, -45, 44, -44, 44, -44, 43, -43, 43, -42, + 42, -41, 41, -40, 40, -39, 39, -38, 38, -37, 36, -36, 35, -34, 34, -33, + 32, -31, 30, -30, 29, -28, 27, -26, 25, -24, 23, -22, 21, -20, 19, -18, + 17, -16, 15, -14, 13, -12, 11, -10, 9, -8, 7, -6, 4, -3, 2, -1, + } +}; diff --git a/src/shaders/icc.c b/src/shaders/icc.c new file mode 100644 index 0000000..6a16cfd --- /dev/null +++ b/src/shaders/icc.c @@ -0,0 +1,781 @@ +/* + * This file is 
part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <math.h> +#include "shaders.h" + +#include <libplacebo/tone_mapping.h> +#include <libplacebo/shaders/icc.h> + +const struct pl_icc_params pl_icc_default_params = { PL_ICC_DEFAULTS }; + +#ifdef PL_HAVE_LCMS + +#include <lcms2.h> +#include <lcms2_plugin.h> + +struct icc_priv { + pl_log log; + pl_cache cache; // for backwards compatibility + cmsContext cms; + cmsHPROFILE profile; + cmsHPROFILE approx; // approximation profile + float a, b, scale; // approxmation tone curve parameters and scaling + cmsCIEXYZ black; + float gamma_stddev; + uint64_t lut_sig; +}; + +static void error_callback(cmsContext cms, cmsUInt32Number code, + const char *msg) +{ + pl_log log = cmsGetContextUserData(cms); + pl_err(log, "lcms2: [%d] %s", (int) code, msg); +} + +static void set_callback(void *priv, pl_cache_obj obj) +{ + pl_icc_object icc = priv; + icc->params.cache_save(icc->params.cache_priv, obj.key, obj.data, obj.size); +} + +static pl_cache_obj get_callback(void *priv, uint64_t key) +{ + pl_icc_object icc = priv; + int s_r = icc->params.size_r, s_g = icc->params.size_g, s_b = icc->params.size_b; + size_t data_size = s_r * s_g * s_b * sizeof(uint16_t[4]); + void *data = pl_alloc(NULL, data_size); + bool ok = icc->params.cache_load(icc->params.cache_priv, key, data, data_size); + if 
(!ok) { + pl_free(data); + return (pl_cache_obj) {0}; + } + + return (pl_cache_obj) { + .key = key, + .data = data, + .size = data_size, + .free = pl_free, + }; +} + +void pl_icc_close(pl_icc_object *picc) +{ + pl_icc_object icc = *picc; + if (!icc) + return; + + struct icc_priv *p = PL_PRIV(icc); + cmsCloseProfile(p->approx); + cmsCloseProfile(p->profile); + cmsDeleteContext(p->cms); + pl_cache_destroy(&p->cache); + pl_free_ptr((void **) picc); +} + +static bool detect_csp(pl_icc_object icc, struct pl_raw_primaries *prim, + float *out_gamma) +{ + struct icc_priv *p = PL_PRIV(icc); + cmsHTRANSFORM tf; + cmsHPROFILE xyz = cmsCreateXYZProfileTHR(p->cms); + if (!xyz) + return false; + + // We need to use an unadapted observer to get the raw values + cmsFloat64Number prev_adapt = cmsSetAdaptationStateTHR(p->cms, 0.0); + tf = cmsCreateTransformTHR(p->cms, p->profile, TYPE_RGB_8, xyz, TYPE_XYZ_DBL, + INTENT_ABSOLUTE_COLORIMETRIC, + /* Note: These flags mostly don't do anything + * anyway, but specify them regardless */ + cmsFLAGS_NOCACHE | + cmsFLAGS_NOOPTIMIZE); + cmsSetAdaptationStateTHR(p->cms, prev_adapt); + cmsCloseProfile(xyz); + if (!tf) + return false; + + enum { + RED, + GREEN, + BLUE, + WHITE, + BLACK, + GRAY, + RAMP, + }; + + static const uint8_t test[][3] = { + [RED] = { 0xFF, 0, 0 }, + [GREEN] = { 0, 0xFF, 0 }, + [BLUE] = { 0, 0, 0xFF }, + [WHITE] = { 0xFF, 0xFF, 0xFF }, + [BLACK] = { 0x00, 0x00, 0x00 }, + [GRAY] = { 0x80, 0x80, 0x80 }, + + // Grayscale ramp (excluding endpoints) +#define V(d) { d, d, d } + V(0x01), V(0x02), V(0x03), V(0x04), V(0x05), V(0x06), V(0x07), + V(0x08), V(0x09), V(0x0A), V(0x0B), V(0x0C), V(0x0D), V(0x0E), V(0x0F), + V(0x10), V(0x11), V(0x12), V(0x13), V(0x14), V(0x15), V(0x16), V(0x17), + V(0x18), V(0x19), V(0x1A), V(0x1B), V(0x1C), V(0x1D), V(0x1E), V(0x1F), + V(0x20), V(0x21), V(0x22), V(0x23), V(0x24), V(0x25), V(0x26), V(0x27), + V(0x28), V(0x29), V(0x2A), V(0x2B), V(0x2C), V(0x2D), V(0x2E), V(0x2F), + V(0x30), V(0x31), 
V(0x32), V(0x33), V(0x34), V(0x35), V(0x36), V(0x37), + V(0x38), V(0x39), V(0x3A), V(0x3B), V(0x3C), V(0x3D), V(0x3E), V(0x3F), + V(0x40), V(0x41), V(0x42), V(0x43), V(0x44), V(0x45), V(0x46), V(0x47), + V(0x48), V(0x49), V(0x4A), V(0x4B), V(0x4C), V(0x4D), V(0x4E), V(0x4F), + V(0x50), V(0x51), V(0x52), V(0x53), V(0x54), V(0x55), V(0x56), V(0x57), + V(0x58), V(0x59), V(0x5A), V(0x5B), V(0x5C), V(0x5D), V(0x5E), V(0x5F), + V(0x60), V(0x61), V(0x62), V(0x63), V(0x64), V(0x65), V(0x66), V(0x67), + V(0x68), V(0x69), V(0x6A), V(0x6B), V(0x6C), V(0x6D), V(0x6E), V(0x6F), + V(0x70), V(0x71), V(0x72), V(0x73), V(0x74), V(0x75), V(0x76), V(0x77), + V(0x78), V(0x79), V(0x7A), V(0x7B), V(0x7C), V(0x7D), V(0x7E), V(0x7F), + V(0x80), V(0x81), V(0x82), V(0x83), V(0x84), V(0x85), V(0x86), V(0x87), + V(0x88), V(0x89), V(0x8A), V(0x8B), V(0x8C), V(0x8D), V(0x8E), V(0x8F), + V(0x90), V(0x91), V(0x92), V(0x93), V(0x94), V(0x95), V(0x96), V(0x97), + V(0x98), V(0x99), V(0x9A), V(0x9B), V(0x9C), V(0x9D), V(0x9E), V(0x9F), + V(0xA0), V(0xA1), V(0xA2), V(0xA3), V(0xA4), V(0xA5), V(0xA6), V(0xA7), + V(0xA8), V(0xA9), V(0xAA), V(0xAB), V(0xAC), V(0xAD), V(0xAE), V(0xAF), + V(0xB0), V(0xB1), V(0xB2), V(0xB3), V(0xB4), V(0xB5), V(0xB6), V(0xB7), + V(0xB8), V(0xB9), V(0xBA), V(0xBB), V(0xBC), V(0xBD), V(0xBE), V(0xBF), + V(0xC0), V(0xC1), V(0xC2), V(0xC3), V(0xC4), V(0xC5), V(0xC6), V(0xC7), + V(0xC8), V(0xC9), V(0xCA), V(0xCB), V(0xCC), V(0xCD), V(0xCE), V(0xCF), + V(0xD0), V(0xD1), V(0xD2), V(0xD3), V(0xD4), V(0xD5), V(0xD6), V(0xD7), + V(0xD8), V(0xD9), V(0xDA), V(0xDB), V(0xDC), V(0xDD), V(0xDE), V(0xDF), + V(0xE0), V(0xE1), V(0xE2), V(0xE3), V(0xE4), V(0xE5), V(0xE6), V(0xE7), + V(0xE8), V(0xE9), V(0xEA), V(0xEB), V(0xEC), V(0xED), V(0xEE), V(0xEF), + V(0xF0), V(0xF1), V(0xF2), V(0xF3), V(0xF4), V(0xF5), V(0xF6), V(0xF7), + V(0xF8), V(0xF9), V(0xFA), V(0xFB), V(0xFC), V(0xFD), V(0xFE), +#undef V + }; + + cmsCIEXYZ dst[PL_ARRAY_SIZE(test)] = {0}; + cmsDoTransform(tf, test, dst, 
PL_ARRAY_SIZE(dst)); + cmsDeleteTransform(tf); + + // Read primaries from transformed RGBW values + prim->red = pl_cie_from_XYZ(dst[RED].X, dst[RED].Y, dst[RED].Z); + prim->green = pl_cie_from_XYZ(dst[GREEN].X, dst[GREEN].Y, dst[GREEN].Z); + prim->blue = pl_cie_from_XYZ(dst[BLUE].X, dst[BLUE].Y, dst[BLUE].Z); + prim->white = pl_cie_from_XYZ(dst[WHITE].X, dst[WHITE].Y, dst[WHITE].Z); + + // Rough estimate of overall gamma and starting point for curve black point + const float y_approx = dst[GRAY].Y ? log(dst[GRAY].Y) / log(0.5) : 1.0f; + const float kb = fmaxf(dst[BLACK].Y, 0.0f); + float b = powf(kb, 1 / y_approx); + + // Estimate mean and stddev of gamma (Welford's method) + float M = 0.0, S = 0.0; + int k = 1; + for (int i = RAMP; i < PL_ARRAY_SIZE(dst); i++) { // exclude primaries + if (dst[i].Y <= 0 || dst[i].Y >= 1) + continue; + float src = (1 - b) * (test[i][0] / 255.0) + b; + float y = log(dst[i].Y) / log(src); + float tmpM = M; + M += (y - tmpM) / k; + S += (y - tmpM) * (y - M); + k++; + + // Update estimate of black point according to current gamma estimate + b = powf(kb, 1 / M); + } + S = sqrt(S / (k - 1)); + + PL_INFO(p, "Detected profile approximation gamma %.3f", M); + if (S > 0.5) { + PL_WARN(p, "Detected profile gamma (%.3f) very far from pure power " + "response (stddev=%.1f), suspected unusual or broken profile. " + "Using anyway, but results may be poor.", M, S); + } else if (!(M > 0)) { + PL_ERR(p, "Arithmetic error in ICC profile gamma estimation? 
" + "Please open an issue"); + return false; + } + + *out_gamma = M; + p->gamma_stddev = S; + return true; +} + +static bool detect_contrast(pl_icc_object icc, struct pl_hdr_metadata *hdr, + struct pl_icc_params *params, float max_luma) +{ + struct icc_priv *p = PL_PRIV(icc); + cmsCIEXYZ *white = cmsReadTag(p->profile, cmsSigLuminanceTag); + enum pl_rendering_intent intent = params->intent; + /* LittleCMS refuses to detect an intent in absolute colorimetric intent, + * so fall back to relative colorimetric since we only care about the + * brightness value here */ + if (intent == PL_INTENT_ABSOLUTE_COLORIMETRIC) + intent = PL_INTENT_RELATIVE_COLORIMETRIC; + if (!cmsDetectDestinationBlackPoint(&p->black, p->profile, intent, 0)) { + /* + * v4 ICC profiles have a black point tag but only for + * perceptual/saturation intents. So we change the rendering intent + * to perceptual if we are provided a v4 ICC profile. + */ + if (cmsGetEncodedICCversion(p->profile) >= 0x4000000 && intent != PL_INTENT_PERCEPTUAL) { + params->intent = PL_INTENT_PERCEPTUAL; + return detect_contrast(icc, hdr, params, max_luma); + } + + PL_ERR(p, "Failed detecting ICC profile black point!"); + return false; + } + + if (white) { + PL_DEBUG(p, "Detected raw white point X=%.2f Y=%.2f Z=%.2f cd/m^2", + white->X, white->Y, white->Z); + } + PL_DEBUG(p, "Detected raw black point X=%.6f%% Y=%.6f%% Z=%.6f%%", + p->black.X * 100, p->black.Y * 100, p->black.Z * 100); + + if (max_luma <= 0) + max_luma = white ? 
white->Y : PL_COLOR_SDR_WHITE; + + hdr->max_luma = max_luma; + hdr->min_luma = p->black.Y * max_luma; + hdr->min_luma = PL_MAX(hdr->min_luma, 1e-6); // prevent true 0 + PL_INFO(p, "Using ICC contrast %.0f:1", hdr->max_luma / hdr->min_luma); + return true; +} + +static void infer_clut_size(struct pl_icc_object_t *icc) +{ + struct icc_priv *p = PL_PRIV(icc); + struct pl_icc_params *params = &icc->params; + if (params->size_r && params->size_g && params->size_b) { + PL_DEBUG(p, "Using fixed 3DLUT size: %dx%dx%d", + (int) params->size_r, (int) params->size_g, (int) params->size_b); + return; + } + +#define REQUIRE_SIZE(N) \ + params->size_r = PL_MAX(params->size_r, N); \ + params->size_g = PL_MAX(params->size_g, N); \ + params->size_b = PL_MAX(params->size_b, N) + + // Default size for sanity + REQUIRE_SIZE(9); + + // Ensure enough precision to track the (absolute) black point + if (p->black.Y > 1e-4) { + float black_rel = powf(p->black.Y, 1.0f / icc->gamma); + int min_size = 2 * (int) ceilf(1.0f / black_rel); + REQUIRE_SIZE(min_size); + } + + // Ensure enough precision to track the gamma curve + if (p->gamma_stddev > 1e-2) { + REQUIRE_SIZE(65); + } else if (p->gamma_stddev > 1e-3) { + REQUIRE_SIZE(33); + } else if (p->gamma_stddev > 1e-4) { + REQUIRE_SIZE(17); + } + + // Ensure enough precision to track any internal CLUTs + cmsPipeline *pipe = NULL; + switch (icc->params.intent) { + case PL_INTENT_SATURATION: + pipe = cmsReadTag(p->profile, cmsSigBToA2Tag); + if (pipe) + break; + // fall through + case PL_INTENT_RELATIVE_COLORIMETRIC: + case PL_INTENT_ABSOLUTE_COLORIMETRIC: + default: + pipe = cmsReadTag(p->profile, cmsSigBToA1Tag); + if (pipe) + break; + // fall through + case PL_INTENT_PERCEPTUAL: + pipe = cmsReadTag(p->profile, cmsSigBToA0Tag); + break; + } + + if (!pipe) { + switch (icc->params.intent) { + case PL_INTENT_SATURATION: + pipe = cmsReadTag(p->profile, cmsSigAToB2Tag); + if (pipe) + break; + // fall through + case PL_INTENT_RELATIVE_COLORIMETRIC: + 
case PL_INTENT_ABSOLUTE_COLORIMETRIC: + default: + pipe = cmsReadTag(p->profile, cmsSigAToB1Tag); + if (pipe) + break; + // fall through + case PL_INTENT_PERCEPTUAL: + pipe = cmsReadTag(p->profile, cmsSigAToB0Tag); + break; + } + } + + if (pipe) { + for (cmsStage *stage = cmsPipelineGetPtrToFirstStage(pipe); + stage; stage = cmsStageNext(stage)) + { + switch (cmsStageType(stage)) { + case cmsSigCLutElemType: ; + _cmsStageCLutData *data = cmsStageData(stage); + if (data->Params->nInputs != 3) + continue; + params->size_r = PL_MAX(params->size_r, data->Params->nSamples[0]); + params->size_g = PL_MAX(params->size_g, data->Params->nSamples[1]); + params->size_b = PL_MAX(params->size_b, data->Params->nSamples[2]); + break; + + default: + continue; + } + } + } + + // Clamp the output size to make sure profiles are not too large + params->size_r = PL_MIN(params->size_r, 129); + params->size_g = PL_MIN(params->size_g, 129); + params->size_b = PL_MIN(params->size_b, 129); + + // Constrain the total LUT size to roughly 1M entries + const size_t max_size = 1000000; + size_t total_size = params->size_r * params->size_g * params->size_b; + if (total_size > max_size) { + float factor = powf((float) max_size / total_size, 1/3.0f); + params->size_r = ceilf(factor * params->size_r); + params->size_g = ceilf(factor * params->size_g); + params->size_b = ceilf(factor * params->size_b); + } + + PL_INFO(p, "Chosen 3DLUT size: %dx%dx%d", + (int) params->size_r, (int) params->size_g, (int) params->size_b); +} + +static bool icc_init(struct pl_icc_object_t *icc) +{ + struct icc_priv *p = PL_PRIV(icc); + struct pl_icc_params *params = &icc->params; + if (params->intent < 0 || params->intent > PL_INTENT_ABSOLUTE_COLORIMETRIC) + params->intent = cmsGetHeaderRenderingIntent(p->profile); + + struct pl_raw_primaries *out_prim = &icc->csp.hdr.prim; + if (!detect_csp(icc, out_prim, &icc->gamma)) + return false; + if (!detect_contrast(icc, &icc->csp.hdr, params, params->max_luma)) + return false; + 
infer_clut_size(icc); + + const struct pl_raw_primaries *best = NULL; + for (enum pl_color_primaries prim = 1; prim < PL_COLOR_PRIM_COUNT; prim++) { + const struct pl_raw_primaries *raw = pl_raw_primaries_get(prim); + if (!icc->csp.primaries && pl_raw_primaries_similar(raw, out_prim)) { + icc->containing_primaries = prim; + icc->csp.primaries = prim; + best = raw; + break; + } + + if (pl_primaries_superset(raw, out_prim) && + (!best || pl_primaries_superset(best, raw))) + { + icc->containing_primaries = prim; + best = raw; + } + } + + if (!best) { + PL_WARN(p, "ICC profile too wide to handle, colors may be clipped!"); + icc->containing_primaries = PL_COLOR_PRIM_ACES_AP0; + best = pl_raw_primaries_get(icc->containing_primaries); + } + + // Create approximation profile. Use a tone-curve based on a BT.1886-style + // pure power curve, with an approximation gamma matched to the ICC + // profile. We stretch the luminance range *before* the input to the gamma + // function, to avoid numerical issues near the black point. 
(This removes + // the need for a separate linear section) + // + // Y = scale * (aX + b)^y, where Y = PCS luma and X = encoded value ([0-1]) + p->scale = pl_hdr_rescale(PL_HDR_NITS, PL_HDR_NORM, icc->csp.hdr.max_luma); + p->b = powf(icc->csp.hdr.min_luma / icc->csp.hdr.max_luma, 1.0f / icc->gamma); + p->a = (1 - p->b); + cmsToneCurve *curve = cmsBuildParametricToneCurve(p->cms, 2, + (double[3]) { icc->gamma, p->a, p->b }); + if (!curve) + return false; + + cmsCIExyY wp_xyY = { best->white.x, best->white.y, 1.0 }; + cmsCIExyYTRIPLE prim_xyY = { + .Red = { best->red.x, best->red.y, 1.0 }, + .Green = { best->green.x, best->green.y, 1.0 }, + .Blue = { best->blue.x, best->blue.y, 1.0 }, + }; + + p->approx = cmsCreateRGBProfileTHR(p->cms, &wp_xyY, &prim_xyY, + (cmsToneCurve *[3]){ curve, curve, curve }); + cmsFreeToneCurve(curve); + if (!p->approx) + return false; + + // We need to create an ICC V2 profile because ICC V4 perceptual profiles + // have normalized semantics, but we want colorimetric mapping with BPC + cmsSetHeaderRenderingIntent(p->approx, icc->params.intent); + cmsSetProfileVersion(p->approx, 2.2); + + // Hash all parameters affecting the generated 3DLUT + p->lut_sig = CACHE_KEY_ICC_3DLUT; + pl_hash_merge(&p->lut_sig, icc->signature); + pl_hash_merge(&p->lut_sig, params->intent); + pl_hash_merge(&p->lut_sig, params->size_r); + pl_hash_merge(&p->lut_sig, params->size_g); + pl_hash_merge(&p->lut_sig, params->size_b); + pl_hash_merge(&p->lut_sig, params->force_bpc); + union { double d; uint64_t u; } v = { .d = icc->csp.hdr.max_luma }; + pl_hash_merge(&p->lut_sig, v.u); + // min luma depends only on the max luma and profile + + // Backwards compatibility with old caching API + if ((params->cache_save || params->cache_load) && !params->cache) { + p->cache = pl_cache_create(pl_cache_params( + .log = p->log, + .set = params->cache_save ? set_callback : NULL, + .get = params->cache_load ? 
get_callback : NULL, + .priv = icc, + )); + } + + return true; +} + +pl_icc_object pl_icc_open(pl_log log, const struct pl_icc_profile *profile, + const struct pl_icc_params *params) +{ + if (!profile->len) + return NULL; + + struct pl_icc_object_t *icc = pl_zalloc_obj(NULL, icc, struct icc_priv); + struct icc_priv *p = PL_PRIV(icc); + icc->params = params ? *params : pl_icc_default_params; + icc->signature = profile->signature; + p->log = log; + p->cms = cmsCreateContext(NULL, (void *) log); + if (!p->cms) { + PL_ERR(p, "Failed creating LittleCMS context!"); + goto error; + } + + cmsSetLogErrorHandlerTHR(p->cms, error_callback); + PL_INFO(p, "Opening ICC profile.."); + p->profile = cmsOpenProfileFromMemTHR(p->cms, profile->data, profile->len); + if (!p->profile) { + PL_ERR(p, "Failed opening ICC profile"); + goto error; + } + + if (cmsGetColorSpace(p->profile) != cmsSigRgbData) { + PL_ERR(p, "Invalid ICC profile: not RGB"); + goto error; + } + + if (!icc_init(icc)) + goto error; + + return icc; + +error: + pl_icc_close((pl_icc_object *) &icc); + return NULL; +} + +static bool icc_reopen(pl_icc_object kicc, const struct pl_icc_params *params) +{ + struct pl_icc_object_t *icc = (struct pl_icc_object_t *) kicc; + struct icc_priv *p = PL_PRIV(icc); + cmsCloseProfile(p->approx); + pl_cache_destroy(&p->cache); + + *icc = (struct pl_icc_object_t) { + .params = *params, + .signature = icc->signature, + }; + + *p = (struct icc_priv) { + .log = p->log, + .cms = p->cms, + .profile = p->profile, + }; + + PL_DEBUG(p, "Reinitializing ICC profile in-place"); + return icc_init(icc); +} + +bool pl_icc_update(pl_log log, pl_icc_object *out_icc, + const struct pl_icc_profile *profile, + const struct pl_icc_params *params) +{ + params = PL_DEF(params, &pl_icc_default_params); + pl_icc_object icc = *out_icc; + if (!icc && !profile) + return false; // nothing to update + + uint64_t sig = profile ? 
profile->signature : icc->signature; + if (!icc || icc->signature != sig) { + pl_assert(profile); + pl_icc_close(&icc); + *out_icc = icc = pl_icc_open(log, profile, params); + return icc != NULL; + } + + int size_r = PL_DEF(params->size_r, icc->params.size_r); + int size_g = PL_DEF(params->size_g, icc->params.size_g); + int size_b = PL_DEF(params->size_b, icc->params.size_b); + bool compat = params->intent == icc->params.intent && + params->max_luma == icc->params.max_luma && + params->force_bpc == icc->params.force_bpc && + size_r == icc->params.size_r && + size_g == icc->params.size_g && + size_b == icc->params.size_b; + if (compat) + return true; + + // ICC signature is the same but parameters are different, re-open in-place + if (!icc_reopen(icc, params)) { + pl_icc_close(&icc); + *out_icc = NULL; + return false; + } + + return true; +} + +static void fill_lut(void *datap, const struct sh_lut_params *params, bool decode) +{ + pl_icc_object icc = params->priv; + struct icc_priv *p = PL_PRIV(icc); + cmsHPROFILE srcp = decode ? p->profile : p->approx; + cmsHPROFILE dstp = decode ? 
p->approx : p->profile; + int s_r = params->width, s_g = params->height, s_b = params->depth; + + pl_clock_t start = pl_clock_now(); + cmsHTRANSFORM tf = cmsCreateTransformTHR(p->cms, srcp, TYPE_RGB_16, + dstp, TYPE_RGBA_16, + icc->params.intent, + cmsFLAGS_BLACKPOINTCOMPENSATION | + cmsFLAGS_NOCACHE | cmsFLAGS_NOOPTIMIZE); + if (!tf) + return; + + pl_clock_t after_transform = pl_clock_now(); + pl_log_cpu_time(p->log, start, after_transform, "creating ICC transform"); + + uint16_t *tmp = pl_alloc(NULL, s_r * 3 * sizeof(tmp[0])); + for (int b = 0; b < s_b; b++) { + for (int g = 0; g < s_g; g++) { + // Transform a single line of the output buffer + for (int r = 0; r < s_r; r++) { + tmp[r * 3 + 0] = r * 65535 / (s_r - 1); + tmp[r * 3 + 1] = g * 65535 / (s_g - 1); + tmp[r * 3 + 2] = b * 65535 / (s_b - 1); + } + + size_t offset = (b * s_g + g) * s_r * 4; + uint16_t *data = ((uint16_t *) datap) + offset; + cmsDoTransform(tf, tmp, data, s_r); + + if (!icc->params.force_bpc) + continue; + + // Fix the black point manually. Work-around for "improper" + // profiles, as black point compensation should already have + // taken care of this normally. 
+ const uint16_t knee = 16u << 8; + if (tmp[0] >= knee || tmp[1] >= knee) + continue; + for (int r = 0; r < s_r; r++) { + uint16_t s = (2 * tmp[1] + tmp[2] + tmp[r * 3]) >> 2; + if (s >= knee) + break; + for (int c = 0; c < 3; c++) + data[r * 3 + c] = (s * data[r * 3 + c] + (knee - s) * s) >> 12; + } + } + } + + pl_log_cpu_time(p->log, after_transform, pl_clock_now(), "generating ICC 3DLUT"); + cmsDeleteTransform(tf); + pl_free(tmp); +} + +static void fill_decode(void *datap, const struct sh_lut_params *params) +{ + fill_lut(datap, params, true); +} + +static void fill_encode(void *datap, const struct sh_lut_params *params) +{ + fill_lut(datap, params, false); +} + +static pl_cache get_cache(pl_icc_object icc, pl_shader sh) +{ + struct icc_priv *p = PL_PRIV(icc); + return PL_DEF(icc->params.cache, PL_DEF(p->cache, SH_CACHE(sh))); +} + +void pl_icc_decode(pl_shader sh, pl_icc_object icc, pl_shader_obj *lut_obj, + struct pl_color_space *out_csp) +{ + struct icc_priv *p = PL_PRIV(icc); + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return; + + pl_fmt fmt = pl_find_fmt(SH_GPU(sh), PL_FMT_UNORM, 4, 16, 16, PL_FMT_CAP_LINEAR); + if (!fmt) { + SH_FAIL(sh, "Failed finding ICC 3DLUT texture format!"); + return; + } + + ident_t lut = sh_lut(sh, sh_lut_params( + .object = lut_obj, + .var_type = PL_VAR_FLOAT, + .method = SH_LUT_TETRAHEDRAL, + .fmt = fmt, + .width = icc->params.size_r, + .height = icc->params.size_g, + .depth = icc->params.size_b, + .comps = 4, + .signature = p->lut_sig, + .fill = fill_decode, + .cache = get_cache(icc, sh), + .priv = (void *) icc, + )); + + if (!lut) { + SH_FAIL(sh, "pl_icc_decode: failed generating LUT object"); + return; + } + + // Y = scale * (aX + b)^y + sh_describe(sh, "ICC 3DLUT"); + GLSL("// pl_icc_decode \n" + "{ \n" + "color.rgb = "$"(color.rgb).rgb; \n" + "color.rgb = "$" * color.rgb + vec3("$"); \n" + "color.rgb = pow(color.rgb, vec3("$")); \n" + "color.rgb = "$" * color.rgb; \n" + "} \n", + lut, + SH_FLOAT(p->a), 
SH_FLOAT(p->b), + SH_FLOAT(icc->gamma), + SH_FLOAT(p->scale)); + + if (out_csp) { + *out_csp = (struct pl_color_space) { + .primaries = icc->containing_primaries, + .transfer = PL_COLOR_TRC_LINEAR, + .hdr = icc->csp.hdr, + }; + } +} + +void pl_icc_encode(pl_shader sh, pl_icc_object icc, pl_shader_obj *lut_obj) +{ + struct icc_priv *p = PL_PRIV(icc); + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return; + + pl_fmt fmt = pl_find_fmt(SH_GPU(sh), PL_FMT_UNORM, 4, 16, 16, PL_FMT_CAP_LINEAR); + if (!fmt) { + SH_FAIL(sh, "Failed finding ICC 3DLUT texture format!"); + return; + } + + ident_t lut = sh_lut(sh, sh_lut_params( + .object = lut_obj, + .var_type = PL_VAR_FLOAT, + .method = SH_LUT_TETRAHEDRAL, + .fmt = fmt, + .width = icc->params.size_r, + .height = icc->params.size_g, + .depth = icc->params.size_b, + .comps = 4, + .signature = ~p->lut_sig, // avoid confusion with decoding LUTs + .fill = fill_encode, + .cache = get_cache(icc, sh), + .priv = (void *) icc, + )); + + if (!lut) { + SH_FAIL(sh, "pl_icc_encode: failed generating LUT object"); + return; + } + + // X = 1/a * (Y/scale)^(1/y) - b/a + sh_describe(sh, "ICC 3DLUT"); + GLSL("// pl_icc_encode \n" + "{ \n" + "color.rgb = max(color.rgb, 0.0); \n" + "color.rgb = 1.0/"$" * color.rgb; \n" + "color.rgb = pow(color.rgb, vec3("$")); \n" + "color.rgb = 1.0/"$" * color.rgb - "$"; \n" + "color.rgb = "$"(color.rgb).rgb; \n" + "} \n", + SH_FLOAT(p->scale), + SH_FLOAT(1.0f / icc->gamma), + SH_FLOAT(p->a), SH_FLOAT(p->b / p->a), + lut); +} + +#else // !PL_HAVE_LCMS + +void pl_icc_close(pl_icc_object *picc) {}; +pl_icc_object pl_icc_open(pl_log log, const struct pl_icc_profile *profile, + const struct pl_icc_params *pparams) +{ + pl_err(log, "libplacebo compiled without LittleCMS 2 support!"); + return NULL; +} + +bool pl_icc_update(pl_log log, pl_icc_object *obj, + const struct pl_icc_profile *profile, + const struct pl_icc_params *params) +{ + static bool warned; + if (!warned) { + pl_err(log, "libplacebo compiled 
without LittleCMS 2 support!"); + warned = true; + } + *obj = NULL; + return false; +} + +void pl_icc_decode(pl_shader sh, pl_icc_object icc, pl_shader_obj *lut_obj, + struct pl_color_space *out_csp) +{ + pl_unreachable(); // can't get a pl_icc_object +} + +void pl_icc_encode(pl_shader sh, pl_icc_object icc, pl_shader_obj *lut_obj) +{ + pl_unreachable(); +} + +#endif diff --git a/src/shaders/lut.c b/src/shaders/lut.c new file mode 100644 index 0000000..b0124fc --- /dev/null +++ b/src/shaders/lut.c @@ -0,0 +1,820 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include <math.h> +#include <ctype.h> + +#include "shaders.h" + +#include <libplacebo/shaders/lut.h> + +static inline bool isnumeric(char c) +{ + return (c >= '0' && c <= '9') || c == '-'; +} + +void pl_lut_free(struct pl_custom_lut **lut) +{ + pl_free_ptr(lut); +} + +struct pl_custom_lut *pl_lut_parse_cube(pl_log log, const char *cstr, size_t cstr_len) +{ + struct pl_custom_lut *lut = pl_zalloc_ptr(NULL, lut); + pl_str str = (pl_str) { (uint8_t *) cstr, cstr_len }; + lut->signature = pl_str_hash(str); + int entries = 0; + + float min[3] = { 0.0, 0.0, 0.0 }; + float max[3] = { 1.0, 1.0, 1.0 }; + + // Parse header + while (str.len && !isnumeric(str.buf[0])) { + pl_str line = pl_str_strip(pl_str_getline(str, &str)); + if (!line.len) + continue; // skip empty line + + if (pl_str_eatstart0(&line, "TITLE")) { + pl_info(log, "Loading LUT: %.*s", PL_STR_FMT(pl_str_strip(line))); + continue; + } + + if (pl_str_eatstart0(&line, "LUT_3D_SIZE")) { + line = pl_str_strip(line); + int size; + if (!pl_str_parse_int(line, &size)) { + pl_err(log, "Failed parsing dimension '%.*s'", PL_STR_FMT(line)); + goto error; + } + if (size <= 0 || size > 1024) { + pl_err(log, "Invalid 3DLUT size: %dx%d%x", size, size, size); + goto error; + } + + lut->size[0] = lut->size[1] = lut->size[2] = size; + entries = size * size * size; + continue; + } + + if (pl_str_eatstart0(&line, "LUT_1D_SIZE")) { + line = pl_str_strip(line); + int size; + if (!pl_str_parse_int(line, &size)) { + pl_err(log, "Failed parsing dimension '%.*s'", PL_STR_FMT(line)); + goto error; + } + if (size <= 0 || size > 65536) { + pl_err(log, "Invalid 1DLUT size: %d", size); + goto error; + } + + lut->size[0] = size; + lut->size[1] = lut->size[2] = 0; + entries = size; + continue; + } + + if (pl_str_eatstart0(&line, "DOMAIN_MIN")) { + line = pl_str_strip(line); + if (!pl_str_parse_float(pl_str_split_char(line, ' ', &line), &min[0]) || + !pl_str_parse_float(pl_str_split_char(line, ' ', &line), &min[1]) || + 
!pl_str_parse_float(line, &min[2])) + { + pl_err(log, "Failed parsing domain: '%.*s'", PL_STR_FMT(line)); + goto error; + } + continue; + } + + if (pl_str_eatstart0(&line, "DOMAIN_MAX")) { + line = pl_str_strip(line); + if (!pl_str_parse_float(pl_str_split_char(line, ' ', &line), &max[0]) || + !pl_str_parse_float(pl_str_split_char(line, ' ', &line), &max[1]) || + !pl_str_parse_float(line, &max[2])) + { + pl_err(log, "Failed parsing domain: '%.*s'", PL_STR_FMT(line)); + goto error; + } + continue; + } + + if (pl_str_eatstart0(&line, "#")) { + pl_debug(log, "Unhandled .cube comment: %.*s", + PL_STR_FMT(pl_str_strip(line))); + continue; + } + + pl_warn(log, "Unhandled .cube line: %.*s", PL_STR_FMT(pl_str_strip(line))); + } + + if (!entries) { + pl_err(log, "Missing LUT size specification?"); + goto error; + } + + for (int i = 0; i < 3; i++) { + if (max[i] - min[i] < 1e-6) { + pl_err(log, "Invalid domain range: [%f, %f]", min[i], max[i]); + goto error; + } + } + + float *data = pl_alloc(lut, sizeof(float[3]) * entries); + lut->data = data; + + // Parse LUT body + pl_clock_t start = pl_clock_now(); + for (int n = 0; n < entries; n++) { + for (int c = 0; c < 3; c++) { + static const char * const digits = "0123456789.-+e"; + + // Extract valid digit sequence + size_t len = pl_strspn(str, digits); + pl_str entry = (pl_str) { str.buf, len }; + str.buf += len; + str.len -= len; + + if (!entry.len) { + if (!str.len) { + pl_err(log, "Failed parsing LUT: Unexpected EOF, expected " + "%d entries, got %d", entries * 3, n * 3 + c + 1); + } else { + pl_err(log, "Failed parsing LUT: Unexpected '%c', expected " + "digit", str.buf[0]); + } + goto error; + } + + float num; + if (!pl_str_parse_float(entry, &num)) { + pl_err(log, "Failed parsing float value '%.*s'", PL_STR_FMT(entry)); + goto error; + } + + // Rescale to range 0.0 - 1.0 + *data++ = (num - min[c]) / (max[c] - min[c]); + + // Skip whitespace between digits + str = pl_str_strip(str); + } + } + + str = pl_str_strip(str); + 
if (str.len) + pl_warn(log, "Extra data after LUT?... ignoring '%c'", str.buf[0]); + + pl_log_cpu_time(log, start, pl_clock_now(), "parsing .cube LUT"); + return lut; + +error: + pl_free(lut); + return NULL; +} + +static void fill_lut(void *datap, const struct sh_lut_params *params) +{ + const struct pl_custom_lut *lut = params->priv; + + int dim_r = params->width; + int dim_g = PL_DEF(params->height, 1); + int dim_b = PL_DEF(params->depth, 1); + + float *data = datap; + for (int b = 0; b < dim_b; b++) { + for (int g = 0; g < dim_g; g++) { + for (int r = 0; r < dim_r; r++) { + size_t offset = (b * dim_g + g) * dim_r + r; + const float *src = &lut->data[offset * 3]; + float *dst = &data[offset * 4]; + dst[0] = src[0]; + dst[1] = src[1]; + dst[2] = src[2]; + dst[3] = 0.0f; + } + } + } +} + +void pl_shader_custom_lut(pl_shader sh, const struct pl_custom_lut *lut, + pl_shader_obj *lut_state) +{ + if (!lut) + return; + + int dims; + if (lut->size[0] > 0 && lut->size[1] > 0 && lut->size[2] > 0) { + dims = 3; + } else if (lut->size[0] > 0 && !lut->size[1] && !lut->size[2]) { + dims = 1; + } else { + SH_FAIL(sh, "Invalid dimensions %dx%dx%d for pl_custom_lut, must be 1D " + "or 3D!", lut->size[0], lut->size[1], lut->size[2]); + return; + } + + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return; + + ident_t fun = sh_lut(sh, sh_lut_params( + .object = lut_state, + .var_type = PL_VAR_FLOAT, + .method = SH_LUT_TETRAHEDRAL, + .width = lut->size[0], + .height = lut->size[1], + .depth = lut->size[2], + .comps = 4, // for better texel alignment + .signature = lut->signature, + .fill = fill_lut, + .priv = (void *) lut, + )); + + if (!fun) { + SH_FAIL(sh, "pl_shader_custom_lut: failed generating LUT object"); + return; + } + + GLSL("// pl_shader_custom_lut \n"); + + static const pl_matrix3x3 zero = {0}; + if (memcmp(&lut->shaper_in, &zero, sizeof(zero)) != 0) { + GLSL("color.rgb = "$" * color.rgb; \n", sh_var(sh, (struct pl_shader_var) { + .var = pl_var_mat3("shaper_in"), + 
.data = PL_TRANSPOSE_3X3(lut->shaper_in.m), + })); + } + + switch (dims) { + case 1: + sh_describe(sh, "custom 1DLUT"); + GLSL("color.rgb = vec3("$"(color.r).r, \n" + " "$"(color.g).g, \n" + " "$"(color.b).b); \n", + fun, fun, fun); + break; + case 3: + sh_describe(sh, "custom 3DLUT"); + GLSL("color.rgb = "$"(color.rgb).rgb; \n", fun); + break; + } + + if (memcmp(&lut->shaper_out, &zero, sizeof(zero)) != 0) { + GLSL("color.rgb = "$" * color.rgb; \n", sh_var(sh, (struct pl_shader_var) { + .var = pl_var_mat3("shaper_out"), + .data = PL_TRANSPOSE_3X3(lut->shaper_out.m), + })); + } +} + +// Defines a LUT position helper macro. This translates from an absolute texel +// scale (either in texels, or normalized to [0,1]) to the texture coordinate +// scale for the corresponding sample in a texture of dimension `lut_size`. +static ident_t texel_scale(pl_shader sh, int lut_size, bool normalized) +{ + const float base = 0.5f / lut_size; + const float end = 1.0f - 0.5f / lut_size; + const float scale = (end - base) / (normalized ? 
1.0f : (lut_size - 1)); + + ident_t name = sh_fresh(sh, "LUT_SCALE"); + GLSLH("#define "$"(x) ("$" * (x) + "$") \n", + name, SH_FLOAT(scale), SH_FLOAT(base)); + return name; +} + +struct sh_lut_obj { + enum sh_lut_type type; + enum sh_lut_method method; + enum pl_var_type vartype; + pl_fmt fmt; + int width, height, depth, comps; + uint64_t signature; + bool error; // reset if params change + + // weights, depending on the lut type + pl_tex tex; + pl_str str; + void *data; +}; + +static void sh_lut_uninit(pl_gpu gpu, void *ptr) +{ + struct sh_lut_obj *lut = ptr; + pl_tex_destroy(gpu, &lut->tex); + pl_free(lut->str.buf); + pl_free(lut->data); + + *lut = (struct sh_lut_obj) {0}; +} + +// Maximum number of floats to embed as a literal array (when using SH_LUT_AUTO) +#define SH_LUT_MAX_LITERAL_SOFT 64 +#define SH_LUT_MAX_LITERAL_HARD 256 + +ident_t sh_lut(pl_shader sh, const struct sh_lut_params *params) +{ + pl_gpu gpu = SH_GPU(sh); + pl_cache_obj obj = { .key = CACHE_KEY_SH_LUT ^ params->signature }; + + const enum pl_var_type vartype = params->var_type; + pl_assert(vartype != PL_VAR_INVALID); + pl_assert(params->method == SH_LUT_NONE || vartype == PL_VAR_FLOAT); + pl_assert(params->width > 0 && params->height >= 0 && params->depth >= 0); + pl_assert(params->comps > 0); + pl_assert(!params->cache || params->signature); + + int sizes[] = { params->width, params->height, params->depth }; + int size = params->width * PL_DEF(params->height, 1) * PL_DEF(params->depth, 1); + int dims = params->depth ? 3 : params->height ? 2 : 1; + enum sh_lut_method method = params->method; + if (method == SH_LUT_TETRAHEDRAL && dims != 3) + method = SH_LUT_LINEAR; + if (method == SH_LUT_CUBIC && dims != 3) + method = SH_LUT_LINEAR; + + int texdim = 0; + uint32_t max_tex_dim[] = { + gpu ? gpu->limits.max_tex_1d_dim : 0, + gpu ? gpu->limits.max_tex_2d_dim : 0, + (gpu && gpu->glsl.version > 100) ? 
gpu->limits.max_tex_3d_dim : 0, + }; + + struct sh_lut_obj *lut = SH_OBJ(sh, params->object, PL_SHADER_OBJ_LUT, + struct sh_lut_obj, sh_lut_uninit); + + if (!lut) + return NULL_IDENT; + + bool update = params->update || lut->signature != params->signature || + vartype != lut->vartype || params->fmt != lut->fmt || + params->width != lut->width || params->height != lut->height || + params->depth != lut->depth || params->comps != lut->comps; + + if (lut->error && !update) + return NULL_IDENT; // suppress error spam until something changes + + // Try picking the right number of dimensions for the texture LUT. This + // allows e.g. falling back to 2D textures if 1D textures are unsupported. + for (int d = dims; d <= PL_ARRAY_SIZE(max_tex_dim); d++) { + // For a given dimension to be compatible, all coordinates need to be + // within the maximum texture size for that dimension + for (int i = 0; i < d; i++) { + if (sizes[i] > max_tex_dim[d - 1]) + goto next_dim; + } + + // All dimensions are compatible, so pick this texture dimension + texdim = d; + break; + +next_dim: ; // `continue` out of the inner loop + } + + static const enum pl_fmt_type fmt_type[PL_VAR_TYPE_COUNT] = { + [PL_VAR_SINT] = PL_FMT_SINT, + [PL_VAR_UINT] = PL_FMT_UINT, + [PL_VAR_FLOAT] = PL_FMT_FLOAT, + }; + + enum pl_fmt_caps texcaps = PL_FMT_CAP_SAMPLEABLE; + bool is_linear = method == SH_LUT_LINEAR || method == SH_LUT_CUBIC; + if (is_linear) + texcaps |= PL_FMT_CAP_LINEAR; + + pl_fmt texfmt = params->fmt; + if (texfmt) { + bool ok; + switch (texfmt->type) { + case PL_FMT_SINT: ok = vartype == PL_VAR_SINT; break; + case PL_FMT_UINT: ok = vartype == PL_VAR_UINT; break; + default: ok = vartype == PL_VAR_FLOAT; break; + } + + if (!ok) { + PL_ERR(sh, "Specified texture format '%s' does not match LUT " + "data type!", texfmt->name); + goto error; + } + + if (~texfmt->caps & texcaps) { + PL_ERR(sh, "Specified texture format '%s' does not match " + "required capabilities 0x%x!\n", texfmt->name, texcaps); + 
goto error; + } + } + + if (texdim && !texfmt) { + texfmt = pl_find_fmt(gpu, fmt_type[vartype], params->comps, + vartype == PL_VAR_FLOAT ? 16 : 32, + pl_var_type_size(vartype) * 8, + texcaps); + } + + enum sh_lut_type type = params->lut_type; + + // The linear sampling code currently only supports 1D linear interpolation + if (is_linear && dims > 1) { + if (texfmt) { + type = SH_LUT_TEXTURE; + } else { + PL_ERR(sh, "Can't emulate linear LUTs for 2D/3D LUTs and no " + "texture support available!"); + goto error; + } + } + + bool can_uniform = gpu && gpu->limits.max_variable_comps >= size * params->comps; + bool can_literal = sh_glsl(sh).version > 110; // needed for literal arrays + can_literal &= size <= SH_LUT_MAX_LITERAL_HARD && !params->dynamic; + + // Deselect unsupported methods + if (type == SH_LUT_UNIFORM && !can_uniform) + type = SH_LUT_AUTO; + if (type == SH_LUT_LITERAL && !can_literal) + type = SH_LUT_AUTO; + if (type == SH_LUT_TEXTURE && !texfmt) + type = SH_LUT_AUTO; + + // Sorted by priority + if (!type && can_literal && !method && size <= SH_LUT_MAX_LITERAL_SOFT) + type = SH_LUT_LITERAL; + if (!type && texfmt) + type = SH_LUT_TEXTURE; + if (!type && can_uniform) + type = SH_LUT_UNIFORM; + if (!type && can_literal) + type = SH_LUT_LITERAL; + + if (!type) { + PL_ERR(sh, "Can't generate LUT: no compatible methods!"); + goto error; + } + + // Reinitialize the existing LUT if needed + update |= type != lut->type; + update |= method != lut->method; + + if (update) { + if (params->dynamic) + pl_log_level_cap(sh->log, PL_LOG_TRACE); + + size_t el_size = params->comps * pl_var_type_size(vartype); + if (type == SH_LUT_TEXTURE) + el_size = texfmt->texel_size; + + size_t buf_size = size * el_size; + if (pl_cache_get(params->cache, &obj) && obj.size == buf_size) { + PL_DEBUG(sh, "Re-using cached LUT (0x%"PRIx64") with size %zu", + obj.key, obj.size); + } else { + PL_DEBUG(sh, "LUT invalidated, regenerating.."); + pl_cache_obj_resize(NULL, &obj, buf_size); + 
pl_clock_t start = pl_clock_now(); + params->fill(obj.data, params); + pl_log_cpu_time(sh->log, start, pl_clock_now(), "generating shader LUT"); + } + + pl_assert(obj.data && obj.size); + if (params->dynamic) + pl_log_level_cap(sh->log, PL_LOG_NONE); + + switch (type) { + case SH_LUT_TEXTURE: { + if (!texdim) { + PL_ERR(sh, "Texture LUT exceeds texture dimensions!"); + goto error; + } + + if (!texfmt) { + PL_ERR(sh, "Found no compatible texture format for LUT!"); + goto error; + } + + struct pl_tex_params tex_params = { + .w = params->width, + .h = PL_DEF(params->height, texdim >= 2 ? 1 : 0), + .d = PL_DEF(params->depth, texdim >= 3 ? 1 : 0), + .format = texfmt, + .sampleable = true, + .host_writable = params->dynamic, + .initial_data = params->dynamic ? NULL : obj.data, + .debug_tag = params->debug_tag, + }; + + bool ok; + if (params->dynamic) { + ok = pl_tex_recreate(gpu, &lut->tex, &tex_params); + if (ok) { + ok = pl_tex_upload(gpu, pl_tex_transfer_params( + .tex = lut->tex, + .ptr = obj.data, + )); + } + } else { + // Can't use pl_tex_recreate because of `initial_data` + pl_tex_destroy(gpu, &lut->tex); + lut->tex = pl_tex_create(gpu, &tex_params); + ok = lut->tex; + } + + if (!ok) { + PL_ERR(sh, "Failed creating LUT texture!"); + goto error; + } + break; + } + + case SH_LUT_UNIFORM: + pl_free(lut->data); + lut->data = pl_memdup(NULL, obj.data, obj.size); + break; + + case SH_LUT_LITERAL: { + lut->str.len = 0; + static const char prefix[PL_VAR_TYPE_COUNT] = { + [PL_VAR_SINT] = 'i', + [PL_VAR_UINT] = 'u', + [PL_VAR_FLOAT] = ' ', + }; + + for (int i = 0; i < size * params->comps; i += params->comps) { + if (i > 0) + pl_str_append_asprintf_c(lut, &lut->str, ","); + if (params->comps > 1) { + pl_str_append_asprintf_c(lut, &lut->str, "%cvec%d(", + prefix[vartype], params->comps); + } + for (int c = 0; c < params->comps; c++) { + switch (vartype) { + case PL_VAR_FLOAT: + pl_str_append_asprintf_c(lut, &lut->str, "%s%f", + c > 0 ? 
"," : "", + ((float *) obj.data)[i+c]); + break; + case PL_VAR_UINT: + pl_str_append_asprintf_c(lut, &lut->str, "%s%u", + c > 0 ? "," : "", + ((unsigned int *) obj.data)[i+c]); + break; + case PL_VAR_SINT: + pl_str_append_asprintf_c(lut, &lut->str, "%s%d", + c > 0 ? "," : "", + ((int *) obj.data)[i+c]); + break; + case PL_VAR_INVALID: + case PL_VAR_TYPE_COUNT: + pl_unreachable(); + } + } + if (params->comps > 1) + pl_str_append_asprintf_c(lut, &lut->str, ")"); + } + break; + } + + case SH_LUT_AUTO: + pl_unreachable(); + } + + lut->type = type; + lut->method = method; + lut->vartype = vartype; + lut->fmt = params->fmt; + lut->width = params->width; + lut->height = params->height; + lut->depth = params->depth; + lut->comps = params->comps; + lut->signature = params->signature; + pl_cache_set(params->cache, &obj); + } + + // Done updating, generate the GLSL + ident_t name = sh_fresh(sh, "lut"); + ident_t arr_name = NULL_IDENT; + + static const char * const swizzles[] = {"x", "xy", "xyz", "xyzw"}; + static const char * const vartypes[PL_VAR_TYPE_COUNT][4] = { + [PL_VAR_SINT] = { "int", "ivec2", "ivec3", "ivec4" }, + [PL_VAR_UINT] = { "uint", "uvec2", "uvec3", "uvec4" }, + [PL_VAR_FLOAT] = { "float", "vec2", "vec3", "vec4" }, + }; + + switch (type) { + case SH_LUT_TEXTURE: { + assert(texdim); + ident_t tex = sh_desc(sh, (struct pl_shader_desc) { + .desc = { + .name = "weights", + .type = PL_DESC_SAMPLED_TEX, + }, + .binding = { + .object = lut->tex, + .sample_mode = is_linear ? PL_TEX_SAMPLE_LINEAR + : PL_TEX_SAMPLE_NEAREST, + } + }); + + if (is_linear) { + ident_t pos_macros[PL_ARRAY_SIZE(sizes)] = {0}; + for (int i = 0; i < dims; i++) + pos_macros[i] = texel_scale(sh, sizes[i], true); + + GLSLH("#define "$"(pos) (textureLod("$", %s(\\\n", + name, tex, vartypes[PL_VAR_FLOAT][texdim - 1]); + + for (int i = 0; i < texdim; i++) { + char sep = i == 0 ? 
' ' : ','; + if (pos_macros[i]) { + if (dims > 1) { + GLSLH(" %c"$"(%s(pos).%c)\\\n", sep, pos_macros[i], + vartypes[PL_VAR_FLOAT][dims - 1], "xyzw"[i]); + } else { + GLSLH(" %c"$"(float(pos))\\\n", sep, pos_macros[i]); + } + } else { + GLSLH(" %c%f\\\n", sep, 0.5); + } + } + GLSLH(" ), 0.0).%s)\n", swizzles[params->comps - 1]); + } else { + GLSLH("#define "$"(pos) (texelFetch("$", %s(pos", + name, tex, vartypes[PL_VAR_SINT][texdim - 1]); + + // Fill up extra components of the index + for (int i = dims; i < texdim; i++) + GLSLH(", 0"); + + GLSLH("), 0).%s)\n", swizzles[params->comps - 1]); + } + break; + } + + case SH_LUT_UNIFORM: + arr_name = sh_var(sh, (struct pl_shader_var) { + .var = { + .name = "weights", + .type = vartype, + .dim_v = params->comps, + .dim_m = 1, + .dim_a = size, + }, + .data = lut->data, + }); + break; + + case SH_LUT_LITERAL: + arr_name = sh_fresh(sh, "weights"); + GLSLH("const %s "$"[%d] = %s[](\n ", + vartypes[vartype][params->comps - 1], arr_name, size, + vartypes[vartype][params->comps - 1]); + sh_append_str(sh, SH_BUF_HEADER, lut->str); + GLSLH(");\n"); + break; + + case SH_LUT_AUTO: + pl_unreachable(); + } + + if (arr_name) { + GLSLH("#define "$"(pos) ("$"[int((pos)%s)\\\n", + name, arr_name, dims > 1 ? 
"[0]" : ""); + int shift = params->width; + for (int i = 1; i < dims; i++) { + GLSLH(" + %d * int((pos)[%d])\\\n", shift, i); + shift *= sizes[i]; + } + GLSLH(" ])\n"); + + if (is_linear) { + pl_assert(dims == 1); + pl_assert(vartype == PL_VAR_FLOAT); + ident_t arr_lut = name; + name = sh_fresh(sh, "lut_lin"); + GLSLH("%s "$"(float fpos) { \n" + " fpos = clamp(fpos, 0.0, 1.0) * %d.0; \n" + " float fbase = floor(fpos); \n" + " float fceil = ceil(fpos); \n" + " float fcoord = fpos - fbase; \n" + " return mix("$"(fbase), "$"(fceil), fcoord); \n" + "} \n", + vartypes[PL_VAR_FLOAT][params->comps - 1], name, + size - 1, + arr_lut, arr_lut); + } + } + + if (method == SH_LUT_CUBIC && dims == 3) { + ident_t lin_lut = name; + name = sh_fresh(sh, "lut_tricubic"); + GLSLH("%s "$"(vec3 pos) { \n" + " vec3 scale = vec3(%d.0, %d.0, %d.0); \n" + " vec3 scale_inv = 1.0 / scale; \n" + " pos *= scale; \n" + " vec3 fpos = fract(pos); \n" + " vec3 base = pos - fpos; \n" + " vec3 fpos2 = fpos * fpos; \n" + " vec3 inv = 1.0 - fpos; \n" + " vec3 inv2 = inv * inv; \n" + " vec3 w0 = 1.0/6.0 * inv2 * inv; \n" + " vec3 w1 = 2.0/3.0 - 0.5 * fpos2 * (2.0 - fpos); \n" + " vec3 w2 = 2.0/3.0 - 0.5 * inv2 * (2.0 - inv); \n" + " vec3 w3 = 1.0/6.0 * fpos2 * fpos; \n" + " vec3 g0 = w0 + w1; \n" + " vec3 g1 = w2 + w3; \n" + " vec3 h0 = scale_inv * ((w1 / g0) - 1.0 + base); \n" + " vec3 h1 = scale_inv * ((w3 / g1) + 1.0 + base); \n" + " %s c000, c001, c010, c011, c100, c101, c110, c111; \n" + " c000 = "$"(h0); \n" + " c100 = "$"(vec3(h1.x, h0.y, h0.z)); \n" + " c000 = mix(c100, c000, g0.x); \n" + " c010 = "$"(vec3(h0.x, h1.y, h0.z)); \n" + " c110 = "$"(vec3(h1.x, h1.y, h0.z)); \n" + " c010 = mix(c110, c010, g0.x); \n" + " c000 = mix(c010, c000, g0.y); \n" + " c001 = "$"(vec3(h0.x, h0.y, h1.z)); \n" + " c101 = "$"(vec3(h1.x, h0.y, h1.z)); \n" + " c001 = mix(c101, c001, g0.x); \n" + " c011 = "$"(vec3(h0.x, h1.y, h1.z)); \n" + " c111 = "$"(h1); \n" + " c011 = mix(c111, c011, g0.x); \n" + " c001 = mix(c011, 
c001, g0.y); \n" + " return mix(c001, c000, g0.z); \n" + "} \n", + vartypes[PL_VAR_FLOAT][params->comps - 1], name, + sizes[0] - 1, sizes[1] - 1, sizes[2] - 1, + vartypes[PL_VAR_FLOAT][params->comps - 1], + lin_lut, lin_lut, lin_lut, lin_lut, + lin_lut, lin_lut, lin_lut, lin_lut); + } + + if (method == SH_LUT_TETRAHEDRAL) { + ident_t int_lut = name; + name = sh_fresh(sh, "lut_barycentric"); + GLSLH("%s "$"(vec3 pos) { \n" + // Compute bounding vertices and fractional part + " pos = clamp(pos, 0.0, 1.0) * vec3(%d.0, %d.0, %d.0); \n" + " vec3 base = floor(pos); \n" + " vec3 fpart = pos - base; \n" + // v0 and v3 are always 'black' and 'white', respectively + // v1 and v2 are the closest RGB and CMY vertices, respectively + " ivec3 v0 = ivec3(base), v3 = ivec3(ceil(pos)); \n" + " ivec3 v1 = v0, v2 = v3; \n" + // Table of boolean checks to simplify following math + " bvec3 c = greaterThanEqual(fpart.xyz, fpart.yzx); \n" + " bool c_xy = c.x, c_yx = !c.x, \n" + " c_yz = c.y, c_zy = !c.y, \n" + " c_zx = c.z, c_xz = !c.z; \n" + " vec3 s = fpart.xyz; \n" + " bool cond; \n", + vartypes[PL_VAR_FLOAT][params->comps - 1], name, + sizes[0] - 1, sizes[1] - 1, sizes[2] - 1); + + // Subdivision of the cube into six congruent tetrahedras + // + // For each tetrahedron, test if the point is inside, and if so, update + // the edge vertices. We test all six, even though only one case will + // ever be true, because this avoids branches. + static const char *indices[] = { "xyz", "xzy", "zxy", "zyx", "yzx", "yxz"}; + for (int i = 0; i < PL_ARRAY_SIZE(indices); i++) { + const char x = indices[i][0], y = indices[i][1], z = indices[i][2]; + GLSLH("cond = c_%c%c && c_%c%c; \n" + "s = cond ? fpart.%c%c%c : s; \n" + "v1.%c = cond ? v3.%c : v1.%c; \n" + "v2.%c = cond ? 
v0.%c : v2.%c; \n", + x, y, y, z, + x, y, z, + x, x, x, + z, z, z); + } + + // Interpolate in barycentric coordinates, with four texel fetches + GLSLH(" return (1.0 - s.x) * "$"(v0) + \n" + " (s.x - s.y) * "$"(v1) + \n" + " (s.y - s.z) * "$"(v2) + \n" + " (s.z) * "$"(v3); \n" + "} \n", + int_lut, int_lut, int_lut, int_lut); + } + + lut->error = false; + pl_cache_obj_free(&obj); + pl_assert(name); + return name; + +error: + lut->error = true; + pl_cache_obj_free(&obj); + return NULL_IDENT; +} diff --git a/src/shaders/meson.build b/src/shaders/meson.build new file mode 100644 index 0000000..746747c --- /dev/null +++ b/src/shaders/meson.build @@ -0,0 +1,23 @@ +shader_sources = [ + 'colorspace.c', + 'custom.c', + 'custom_mpv.c', + 'deinterlacing.c', + 'dithering.c', + 'film_grain.c', + 'film_grain_av1.c', + 'film_grain_h274.c', + 'icc.c', + 'lut.c', + 'sampling.c', +] + +foreach s : shader_sources + sources += custom_target(s, + command: glsl_preproc, + depend_files: glsl_deps, + env: python_env, + input: s, + output: s, + ) +endforeach diff --git a/src/shaders/sampling.c b/src/shaders/sampling.c new file mode 100644 index 0000000..fc10f80 --- /dev/null +++ b/src/shaders/sampling.c @@ -0,0 +1,1198 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include <math.h> +#include "shaders.h" + +#include <libplacebo/colorspace.h> +#include <libplacebo/shaders/sampling.h> + +const struct pl_deband_params pl_deband_default_params = { PL_DEBAND_DEFAULTS }; + +static inline struct pl_tex_params src_params(const struct pl_sample_src *src) +{ + if (src->tex) + return src->tex->params; + + return (struct pl_tex_params) { + .w = src->tex_w, + .h = src->tex_h, + }; +} + +enum filter { + NEAREST = PL_TEX_SAMPLE_NEAREST, + LINEAR = PL_TEX_SAMPLE_LINEAR, + BEST, + FASTEST, +}; + +// Helper function to compute the src/dst sizes and upscaling ratios +static bool setup_src(pl_shader sh, const struct pl_sample_src *src, + ident_t *src_tex, ident_t *pos, ident_t *pt, + float *ratio_x, float *ratio_y, uint8_t *comp_mask, + float *scale, bool resizeable, + enum filter filter) +{ + enum pl_shader_sig sig; + float src_w, src_h; + enum pl_tex_sample_mode sample_mode; + if (src->tex) { + pl_fmt fmt = src->tex->params.format; + bool can_linear = fmt->caps & PL_FMT_CAP_LINEAR; + pl_assert(pl_tex_params_dimension(src->tex->params) == 2); + sig = PL_SHADER_SIG_NONE; + src_w = pl_rect_w(src->rect); + src_h = pl_rect_h(src->rect); + switch (filter) { + case FASTEST: + case NEAREST: + sample_mode = PL_TEX_SAMPLE_NEAREST; + break; + case LINEAR: + if (!can_linear) { + SH_FAIL(sh, "Trying to use a shader that requires linear " + "sampling with a texture whose format (%s) does not " + "support PL_FMT_CAP_LINEAR", fmt->name); + return false; + } + sample_mode = PL_TEX_SAMPLE_LINEAR; + break; + case BEST: + sample_mode = can_linear ? 
PL_TEX_SAMPLE_LINEAR : PL_TEX_SAMPLE_NEAREST; + break; + } + } else { + pl_assert(src->tex_w && src->tex_h); + sig = PL_SHADER_SIG_SAMPLER; + src_w = src->sampled_w; + src_h = src->sampled_h; + if (filter == BEST || filter == FASTEST) { + sample_mode = src->mode; + } else { + sample_mode = (enum pl_tex_sample_mode) filter; + if (sample_mode != src->mode) { + SH_FAIL(sh, "Trying to use a shader that requires a different " + "filter mode than the external sampler."); + return false; + } + } + } + + src_w = PL_DEF(src_w, src_params(src).w); + src_h = PL_DEF(src_h, src_params(src).h); + pl_assert(src_w && src_h); + + int out_w = PL_DEF(src->new_w, roundf(fabs(src_w))); + int out_h = PL_DEF(src->new_h, roundf(fabs(src_h))); + pl_assert(out_w && out_h); + + if (ratio_x) + *ratio_x = out_w / fabs(src_w); + if (ratio_y) + *ratio_y = out_h / fabs(src_h); + if (scale) + *scale = PL_DEF(src->scale, 1.0); + + if (comp_mask) { + uint8_t tex_mask = 0x0Fu; + if (src->tex) { + // Mask containing only the number of components in the texture + tex_mask = (1 << src->tex->params.format->num_components) - 1; + } + + uint8_t src_mask = src->component_mask; + if (!src_mask) + src_mask = (1 << PL_DEF(src->components, 4)) - 1; + + // Only actually sample components that are both requested and + // available in the texture being sampled + *comp_mask = tex_mask & src_mask; + } + + if (resizeable) + out_w = out_h = 0; + if (!sh_require(sh, sig, out_w, out_h)) + return false; + + if (src->tex) { + pl_rect2df rect = { + .x0 = src->rect.x0, + .y0 = src->rect.y0, + .x1 = src->rect.x0 + src_w, + .y1 = src->rect.y0 + src_h, + }; + + *src_tex = sh_bind(sh, src->tex, src->address_mode, sample_mode, + "src_tex", &rect, pos, pt); + } else { + if (pt) { + float sx = 1.0 / src->tex_w, sy = 1.0 / src->tex_h; + if (src->sampler == PL_SAMPLER_RECT) + sx = sy = 1.0; + + *pt = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_vec2("tex_pt"), + .data = &(float[2]) { sx, sy }, + }); + } + + sh->sampler_type = 
src->sampler; + + pl_assert(src->format); + switch (src->format) { + case PL_FMT_UNKNOWN: + case PL_FMT_FLOAT: + case PL_FMT_UNORM: + case PL_FMT_SNORM: sh->sampler_prefix = ' '; break; + case PL_FMT_UINT: sh->sampler_prefix = 'u'; break; + // GLSL spells signed-integer samplers `isampler*`; this assumes the + // prefix is prepended to the sampler type name, as ' ' and 'u' suggest + // NOTE(review): confirm against the code that consumes sampler_prefix + case PL_FMT_SINT: sh->sampler_prefix = 'i'; break; + case PL_FMT_TYPE_COUNT: + pl_unreachable(); + } + + *src_tex = sh_fresh(sh, "src_tex"); + *pos = sh_fresh(sh, "pos"); + + GLSLH("#define "$" src_tex \n" + "#define "$" pos \n", + *src_tex, *pos); + } + + return true; +} + +// Emits a debanding pass for `src` into `sh`. Falls back to +// pl_deband_default_params when `params` is NULL. The alpha channel is +// excluded from debanding; the sampled color is multiplied by the source scale. +void pl_shader_deband(pl_shader sh, const struct pl_sample_src *src, + const struct pl_deband_params *params) +{ + float scale; + ident_t tex, pos, pt; + uint8_t mask; + if (!setup_src(sh, src, &tex, &pos, &pt, NULL, NULL, &mask, &scale, false, LINEAR)) + return; + + params = PL_DEF(params, &pl_deband_default_params); + sh_describe(sh, "debanding"); + GLSL("vec4 color; \n" + "// pl_shader_deband \n" + "{ \n" + "vec2 pos = "$", pt = "$"; \n" + "color = textureLod("$", pos, 0.0);\n", + pos, pt, tex); + + mask &= ~0x8u; // ignore alpha channel + uint8_t num_comps = sh_num_comps(mask); + const char *swiz = sh_swizzle(mask); + pl_assert(num_comps <= 3); + if (!num_comps) { + GLSL("color *= "$"; \n" + "} \n", + SH_FLOAT(scale)); + return; + } + + GLSL("#define GET(X, Y) \\\n" + " (textureLod("$", pos + pt * vec2(X, Y), 0.0).%s) \n" + "#define T %s \n", + tex, swiz, sh_float_type(mask)); + + ident_t prng = sh_prng(sh, true, NULL); + GLSL("T avg, diff, bound; \n" + "T res = color.%s; \n" + "vec2 d; \n", + swiz); + + if (params->iterations > 0) { + ident_t radius = sh_const_float(sh, "radius", params->radius); + ident_t threshold = sh_const_float(sh, "threshold", + params->threshold / (1000 * scale)); + + // For each iteration, compute the average at a given distance and + // pick it instead of the color if the difference is below the threshold.
+ for (int i = 1; i <= params->iterations; i++) { + GLSL(// Compute a random angle and distance + "d = "$".xy * vec2(%d.0 * "$", %f); \n" + "d = d.x * vec2(cos(d.y), sin(d.y)); \n" + // Sample at quarter-turn intervals around the source pixel + "avg = T(0.0); \n" + "avg += GET(+d.x, +d.y); \n" + "avg += GET(-d.x, +d.y); \n" + "avg += GET(-d.x, -d.y); \n" + "avg += GET(+d.x, -d.y); \n" + "avg *= 0.25; \n" + // Compare the (normalized) average against the pixel + "diff = abs(res - avg); \n" + "bound = T("$" / %d.0); \n", + prng, i, radius, M_PI * 2, + threshold, i); + + if (num_comps > 1) { + GLSL("res = mix(avg, res, greaterThan(diff, bound)); \n"); + } else { + GLSL("res = mix(avg, res, diff > bound); \n"); + } + } + } + + // Add some random noise to smooth out residual differences + if (params->grain > 0) { + // Avoid adding grain near true black + GLSL("bound = T(\n"); + for (int c = 0; c < num_comps; c++) { + GLSL("%c"$, c > 0 ? ',' : ' ', + SH_FLOAT(params->grain_neutral[c] / scale)); + } + GLSL("); \n" + "T strength = min(abs(res - bound), "$"); \n" + "res += strength * (T("$") - T(0.5)); \n", + SH_FLOAT(params->grain / (1000.0 * scale)), prng); + } + + GLSL("color.%s = res; \n" + "color *= "$"; \n" + "#undef T \n" + "#undef GET \n" + "} \n", + swiz, SH_FLOAT(scale)); +} + +bool pl_shader_sample_direct(pl_shader sh, const struct pl_sample_src *src) +{ + float scale; + ident_t tex, pos; + if (!setup_src(sh, src, &tex, &pos, NULL, NULL, NULL, NULL, &scale, true, BEST)) + return false; + + GLSL("// pl_shader_sample_direct \n" + "vec4 color = vec4("$") * textureLod("$", "$", 0.0); \n", + SH_FLOAT(scale), tex, pos); + return true; +} + +bool pl_shader_sample_nearest(pl_shader sh, const struct pl_sample_src *src) +{ + float scale; + ident_t tex, pos; + if (!setup_src(sh, src, &tex, &pos, NULL, NULL, NULL, NULL, &scale, true, NEAREST)) + return false; + + sh_describe(sh, "nearest"); + GLSL("// pl_shader_sample_nearest \n" + "vec4 color = vec4("$") * textureLod("$", 
"$", 0.0); \n", + SH_FLOAT(scale), tex, pos); + return true; +} + +bool pl_shader_sample_bilinear(pl_shader sh, const struct pl_sample_src *src) +{ + float scale; + ident_t tex, pos; + if (!setup_src(sh, src, &tex, &pos, NULL, NULL, NULL, NULL, &scale, true, LINEAR)) + return false; + + sh_describe(sh, "bilinear"); + GLSL("// pl_shader_sample_bilinear \n" + "vec4 color = vec4("$") * textureLod("$", "$", 0.0); \n", + SH_FLOAT(scale), tex, pos); + return true; +} + +bool pl_shader_sample_bicubic(pl_shader sh, const struct pl_sample_src *src) +{ + ident_t tex, pos, pt; + float rx, ry, scale; + if (!setup_src(sh, src, &tex, &pos, &pt, &rx, &ry, NULL, &scale, true, LINEAR)) + return false; + + if (rx < 1 || ry < 1) { + PL_TRACE(sh, "Using fast bicubic sampling when downscaling. This " + "will most likely result in nasty aliasing!"); + } + + // Explanation of how bicubic scaling with only 4 texel fetches is done: + // http://www.mate.tue.nl/mate/pdfs/10318.pdf + // 'Efficient GPU-Based Texture Interpolation using Uniform B-Splines' + + sh_describe(sh, "bicubic"); +#pragma GLSL /* pl_shader_sample_bicubic */ \ + vec4 color; \ + { \ + vec2 pos = $pos; \ + vec2 size = vec2(textureSize($tex, 0)); \ + vec2 frac = fract(pos * size + vec2(0.5)); \ + vec2 frac2 = frac * frac; \ + vec2 inv = vec2(1.0) - frac; \ + vec2 inv2 = inv * inv; \ + /* compute filter weights directly */ \ + vec2 w0 = 1.0/6.0 * inv2 * inv; \ + vec2 w1 = 2.0/3.0 - 0.5 * frac2 * (2.0 - frac); \ + vec2 w2 = 2.0/3.0 - 0.5 * inv2 * (2.0 - inv); \ + vec2 w3 = 1.0/6.0 * frac2 * frac; \ + vec4 g = vec4(w0 + w1, w2 + w3); \ + vec4 h = vec4(w1, w3) / g + inv.xyxy; \ + h.xy -= vec2(2.0); \ + /* sample four corners, then interpolate */ \ + vec4 p = pos.xyxy + $pt.xyxy * h; \ + vec4 c00 = textureLod($tex, p.xy, 0.0); \ + vec4 c01 = textureLod($tex, p.xw, 0.0); \ + vec4 c0 = mix(c01, c00, g.y); \ + vec4 c10 = textureLod($tex, p.zy, 0.0); \ + vec4 c11 = textureLod($tex, p.zw, 0.0); \ + vec4 c1 = mix(c11, c10, g.y); \ + 
color = ${float:scale} * mix(c1, c0, g.x); \ + } + + return true; +} + +bool pl_shader_sample_hermite(pl_shader sh, const struct pl_sample_src *src) +{ + ident_t tex, pos, pt; + float rx, ry, scale; + if (!setup_src(sh, src, &tex, &pos, &pt, &rx, &ry, NULL, &scale, true, LINEAR)) + return false; + + if (rx < 1 || ry < 1) { + PL_TRACE(sh, "Using fast hermite sampling when downscaling. This " + "will most likely result in nasty aliasing!"); + } + + sh_describe(sh, "hermite"); +#pragma GLSL /* pl_shader_sample_hermite */ \ + vec4 color; \ + { \ + vec2 pos = $pos; \ + vec2 size = vec2(textureSize($tex, 0)); \ + vec2 frac = fract(pos * size + vec2(0.5)); \ + pos += $pt * (smoothstep(0.0, 1.0, frac) - frac); \ + color = ${float:scale} * textureLod($tex, pos, 0.0); \ + } + + return true; +} + +bool pl_shader_sample_gaussian(pl_shader sh, const struct pl_sample_src *src) +{ + ident_t tex, pos, pt; + float rx, ry, scale; + if (!setup_src(sh, src, &tex, &pos, &pt, &rx, &ry, NULL, &scale, true, LINEAR)) + return false; + + if (rx < 1 || ry < 1) { + PL_TRACE(sh, "Using fast gaussian sampling when downscaling. 
This " + "will most likely result in nasty aliasing!"); + } + + sh_describe(sh, "gaussian"); +#pragma GLSL /* pl_shader_sample_gaussian */ \ + vec4 color; \ + { \ + vec2 pos = $pos; \ + vec2 size = vec2(textureSize($tex, 0)); \ + vec2 off = -fract(pos * size + vec2(0.5)); \ + vec2 off2 = -2.0 * off * off; \ + /* compute gaussian weights */ \ + vec2 w0 = exp(off2 + 4.0 * off - vec2(2.0)); \ + vec2 w1 = exp(off2); \ + vec2 w2 = exp(off2 - 4.0 * off - vec2(2.0)); \ + vec2 w3 = exp(off2 - 8.0 * off - vec2(8.0)); \ + vec4 g = vec4(w0 + w1, w2 + w3); \ + vec4 h = vec4(w1, w3) / g; \ + h.xy -= vec2(1.0); \ + h.zw += vec2(1.0); \ + g.xy /= g.xy + g.zw; /* explicitly normalize */ \ + /* sample four corners, then interpolate */ \ + vec4 p = pos.xyxy + $pt.xyxy * (h + off.xyxy); \ + vec4 c00 = textureLod($tex, p.xy, 0.0); \ + vec4 c01 = textureLod($tex, p.xw, 0.0); \ + vec4 c0 = mix(c01, c00, g.y); \ + vec4 c10 = textureLod($tex, p.zy, 0.0); \ + vec4 c11 = textureLod($tex, p.zw, 0.0); \ + vec4 c1 = mix(c11, c10, g.y); \ + color = ${float:scale} * mix(c1, c0, g.x); \ + } + + return true; +} + +bool pl_shader_sample_oversample(pl_shader sh, const struct pl_sample_src *src, + float threshold) +{ + ident_t tex, pos, pt; + float rx, ry, scale; + if (!setup_src(sh, src, &tex, &pos, &pt, &rx, &ry, NULL, &scale, true, LINEAR)) + return false; + + threshold = PL_CLAMP(threshold, 0.0f, 0.5f); + sh_describe(sh, "oversample"); + #pragma GLSL /* pl_shader_sample_oversample */ \ + vec4 color; \ + { \ + vec2 pos = $pos; \ + vec2 size = vec2(textureSize($tex, 0)); \ + /* Round the position to the nearest pixel */ \ + vec2 fcoord = fract(pos * size - vec2(0.5)); \ + float rx = ${dynamic float:rx}; \ + float ry = ${dynamic float:ry}; \ + vec2 coeff = (fcoord - vec2(0.5)) * vec2(rx, ry); \ + coeff = clamp(coeff + vec2(0.5), 0.0, 1.0); \ + @if (threshold > 0) { \ + float thresh = ${float:threshold}; \ + coeff = mix(coeff, vec2(0.0), \ + lessThan(coeff, vec2(thresh))); \ + coeff = mix(coeff, 
vec2(1.0), \ + greaterThan(coeff, vec2(1.0 - thresh))); \ + @} \ + \ + /* Compute the right output blend of colors */ \ + pos += (coeff - fcoord) * $pt; \ + color = ${float:scale} * textureLod($tex, pos, 0.0); \ + } + + return true; +} + +static void describe_filter(pl_shader sh, const struct pl_filter_config *cfg, + const char *stage, float rx, float ry) +{ + const char *dir; + if (rx > 1 && ry > 1) { + dir = "up"; + } else if (rx < 1 && ry < 1) { + dir = "down"; + } else if (rx == 1 && ry == 1) { + dir = "noop"; + } else { + dir = "ana"; + } + + if (cfg->name) { + sh_describef(sh, "%s %sscaling (%s)", stage, dir, cfg->name); + } else if (cfg->window) { + sh_describef(sh, "%s %sscaling (%s+%s)", stage, dir, + PL_DEF(cfg->kernel->name, "unknown"), + PL_DEF(cfg->window->name, "unknown")); + } else { + sh_describef(sh, "%s %sscaling (%s)", stage, dir, + PL_DEF(cfg->kernel->name, "unknown")); + } +} + +// Subroutine for computing and adding an individual texel contribution +// If `in` is NULL, samples directly +// If `in` is set, takes the pixel from inX[idx] where X is the component, +// `in` is the given identifier, and `idx` must be defined by the caller +static void polar_sample(pl_shader sh, pl_filter filter, + ident_t tex, ident_t lut, ident_t radius, + int x, int y, uint8_t comp_mask, ident_t in, + bool use_ar, ident_t scale) +{ + // Since we can't know the subpixel position in advance, assume a + // worst case scenario + int yy = y > 0 ? y-1 : y; + int xx = x > 0 ? 
x-1 : x; + float dmin = sqrt(xx*xx + yy*yy); + // Skip samples definitely outside the radius + if (dmin >= filter->radius) + return; + + // Check for samples that might be skippable + bool maybe_skippable = dmin >= filter->radius - M_SQRT2; + + // Check for samples that definitely won't contribute to anti-ringing + const float ar_radius = filter->radius_zero; + use_ar &= dmin < ar_radius; + +#pragma GLSL \ + offset = ivec2(${const int: x}, ${const int: y}); \ + d = length(vec2(offset) - fcoord); \ + @if (maybe_skippable) \ + if (d < $radius) { \ + w = $lut(d * 1.0 / $radius); \ + wsum += w; \ + @if (in != NULL_IDENT) { \ + @for (c : comp_mask) \ + c[@c] = ${in}_@c[idx]; \ + @} else { \ + c = textureLod($tex, base + pt * vec2(offset), 0.0); \ + @} \ + @for (c : comp_mask) \ + color[@c] += w * c[@c]; \ + @if (use_ar) { \ + if (d <= ${const float: ar_radius}) { \ + @for (c : comp_mask) { \ + cc = vec2($scale * c[@c]); \ + cc.x = 1.0 - cc.x; \ + ww = cc + vec2(0.10); \ + ww = ww * ww; \ + ww = ww * ww; \ + ww = ww * ww; \ + ww = ww * ww; \ + ww = ww * ww; \ + ww = w * ww; \ + ar@c += ww * cc; \ + wwsum@c += ww; \ + @} \ + } \ + @} \ + @if (maybe_skippable) \ + } +} + +struct sh_sampler_obj { + pl_filter filter; + pl_shader_obj lut; + pl_shader_obj pass2; // for pl_shader_sample_ortho +}; + +#define SCALER_LUT_SIZE 256 +#define SCALER_LUT_CUTOFF 1e-3f + +static void sh_sampler_uninit(pl_gpu gpu, void *ptr) +{ + struct sh_sampler_obj *obj = ptr; + pl_shader_obj_destroy(&obj->lut); + pl_shader_obj_destroy(&obj->pass2); + pl_filter_free(&obj->filter); + *obj = (struct sh_sampler_obj) {0}; +} + +static void fill_polar_lut(void *data, const struct sh_lut_params *params) +{ + const struct sh_sampler_obj *obj = params->priv; + pl_filter filt = obj->filter; + + pl_assert(params->width == filt->params.lut_entries && params->comps == 1); + memcpy(data, filt->weights, params->width * sizeof(float)); +} + +bool pl_shader_sample_polar(pl_shader sh, const struct pl_sample_src *src, + 
const struct pl_sample_filter_params *params) +{ + pl_assert(params); + if (!params->filter.polar) { + SH_FAIL(sh, "Trying to use polar sampling with a non-polar filter?"); + return false; + } + + uint8_t cmask; + float rx, ry, scalef; + ident_t src_tex, pos, pt, scale; + if (!setup_src(sh, src, &src_tex, &pos, &pt, &rx, &ry, &cmask, &scalef, false, FASTEST)) + return false; + + struct sh_sampler_obj *obj; + obj = SH_OBJ(sh, params->lut, PL_SHADER_OBJ_SAMPLER, struct sh_sampler_obj, + sh_sampler_uninit); + if (!obj) + return false; + + float inv_scale = 1.0 / PL_MIN(rx, ry); + inv_scale = PL_MAX(inv_scale, 1.0); + if (params->no_widening) + inv_scale = 1.0; + scale = sh_const_float(sh, "scale", scalef); + + struct pl_filter_config cfg = params->filter; + cfg.antiring = PL_DEF(cfg.antiring, params->antiring); + cfg.blur = PL_DEF(cfg.blur, 1.0f) * inv_scale; + bool update = !obj->filter || !pl_filter_config_eq(&obj->filter->params.config, &cfg); + if (update) { + pl_filter_free(&obj->filter); + obj->filter = pl_filter_generate(sh->log, pl_filter_params( + .config = cfg, + .lut_entries = SCALER_LUT_SIZE, + .cutoff = SCALER_LUT_CUTOFF, + )); + + if (!obj->filter) { + // This should never happen, but just in case .. 
+ SH_FAIL(sh, "Failed initializing polar filter!"); + return false; + } + } + + describe_filter(sh, &cfg, "polar", rx, ry); + GLSL("// pl_shader_sample_polar \n" + "vec4 color = vec4(0.0); \n" + "{ \n" + "vec2 pos = "$", pt = "$"; \n" + "vec2 size = vec2(textureSize("$", 0)); \n" + "vec2 fcoord = fract(pos * size - vec2(0.5)); \n" + "vec2 base = pos - pt * fcoord; \n" + "vec2 center = base + pt * vec2(0.5); \n" + "ivec2 offset; \n" + "float w, d, wsum = 0.0; \n" + "int idx; \n" + "vec4 c; \n", + pos, pt, src_tex); + + bool use_ar = cfg.antiring > 0; + if (use_ar) { +#pragma GLSL \ + vec2 ww, cc; \ + @for (c : cmask) \ + vec2 ar@c = vec2(0.0), wwsum@c = vec2(0.0); + } + + int bound = ceil(obj->filter->radius); + int offset = bound - 1; // padding top/left + int padding = offset + bound; // total padding + + // Determined experimentally on modern AMD and Nvidia hardware. 32 is a + // good tradeoff for the horizontal work group size. Apart from that, + // just use as many threads as possible. + const int bw = 32, bh = sh_glsl(sh).max_group_threads / bw; + + // We need to sample everything from base_min to base_max, so make sure we + // have enough room in shmem. The extra margin on the ceilf guards against + // floating point inaccuracy on near-integer scaling ratios. 
+ const float margin = 1e-5; + int iw = (int) ceilf(bw / rx - margin) + padding + 1, + ih = (int) ceilf(bh / ry - margin) + padding + 1; + int sizew = iw, sizeh = ih; + + pl_gpu gpu = SH_GPU(sh); + bool dynamic_size = SH_PARAMS(sh).dynamic_constants || + !gpu || !gpu->limits.array_size_constants; + if (dynamic_size) { + // Overallocate the array slightly to reduce recompilation overhead + sizew = PL_ALIGN2(sizew, 8); + sizeh = PL_ALIGN2(sizeh, 8); + } + + int num_comps = __builtin_popcount(cmask); + int shmem_req = (sizew * sizeh * num_comps + 2) * sizeof(float); + bool is_compute = !params->no_compute && sh_glsl(sh).compute && + sh_try_compute(sh, bw, bh, false, shmem_req); + + // Note: SH_LUT_LITERAL might be faster in some specific cases, but not by + // much, and it's catastrophically slow on other platforms. + ident_t lut = sh_lut(sh, sh_lut_params( + .object = &obj->lut, + .lut_type = SH_LUT_TEXTURE, + .var_type = PL_VAR_FLOAT, + .method = SH_LUT_LINEAR, + .width = SCALER_LUT_SIZE, + .comps = 1, + .update = update, + .fill = fill_polar_lut, + .priv = obj, + )); + + if (!lut) { + SH_FAIL(sh, "Failed initializing polar LUT!"); + return false; + } + + ident_t radius_c = sh_const_float(sh, "radius", obj->filter->radius); + ident_t in = sh_fresh(sh, "in"); + + if (is_compute) { + + // Compute shader kernel + GLSL("uvec2 base_id = uvec2(0u); \n"); + if (src->rect.x0 > src->rect.x1) + GLSL("base_id.x = gl_WorkGroupSize.x - 1u; \n"); + if (src->rect.y0 > src->rect.y1) + GLSL("base_id.y = gl_WorkGroupSize.y - 1u; \n"); + + GLSLH("shared vec2 "$"_base; \n", in); + GLSL("if (gl_LocalInvocationID.xy == base_id) \n" + " "$"_base = base; \n" + "barrier(); \n" + "ivec2 rel = ivec2(round((base - "$"_base) * size)); \n", + in, in); + + ident_t sizew_c = sh_const(sh, (struct pl_shader_const) { + .type = PL_VAR_SINT, + .compile_time = true, + .name = "sizew", + .data = &sizew, + }); + + ident_t sizeh_c = sh_const(sh, (struct pl_shader_const) { + .type = PL_VAR_SINT, + 
.compile_time = true, + .name = "sizeh", + .data = &sizeh, + }); + + ident_t iw_c = sizew_c, ih_c = sizeh_c; + if (dynamic_size) { + iw_c = sh_const_int(sh, "iw", iw); + ih_c = sh_const_int(sh, "ih", ih); + } + + // Load all relevant texels into shmem + GLSL("for (int y = int(gl_LocalInvocationID.y); y < "$"; y += %d) { \n" + "for (int x = int(gl_LocalInvocationID.x); x < "$"; x += %d) { \n" + "c = textureLod("$", "$"_base + pt * vec2(x - %d, y - %d), 0.0); \n", + ih_c, bh, iw_c, bw, src_tex, in, offset, offset); + + for (uint8_t comps = cmask; comps;) { + uint8_t c = __builtin_ctz(comps); + GLSLH("shared float "$"_%d["$" * "$"]; \n", in, c, sizeh_c, sizew_c); + GLSL(""$"_%d["$" * y + x] = c[%d]; \n", in, c, sizew_c, c); + comps &= ~(1 << c); + } + + GLSL("}} \n" + "barrier(); \n"); + + // Dispatch the actual samples + for (int y = 1 - bound; y <= bound; y++) { + for (int x = 1 - bound; x <= bound; x++) { + GLSL("idx = "$" * rel.y + rel.x + "$" * %d + %d; \n", + sizew_c, sizew_c, y + offset, x + offset); + polar_sample(sh, obj->filter, src_tex, lut, radius_c, + x, y, cmask, in, use_ar, scale); + } + } + } else { + // Fragment shader sampling + for (uint8_t comps = cmask; comps;) { + uint8_t c = __builtin_ctz(comps); + GLSL("vec4 "$"_%d; \n", in, c); + comps &= ~(1 << c); + } + + // For maximum efficiency, we want to use textureGather() if + // possible, rather than direct sampling. Since this is not + // always possible/sensible, we need to possibly intermix gathering + // with regular sampling. This requires keeping track of which + // pixels in the next row were already gathered by the previous + // row. 
+ uint32_t gathered_cur = 0x0, gathered_next = 0x0; + const float radius2 = PL_SQUARE(obj->filter->radius); + const int base = bound - 1; + + if (base + bound >= 8 * sizeof(gathered_cur)) { + SH_FAIL(sh, "Polar radius %f exceeds implementation capacity!", + obj->filter->radius); + return false; + } + + for (int y = 1 - bound; y <= bound; y++) { + for (int x = 1 - bound; x <= bound; x++) { + // Skip already gathered texels + uint32_t bit = 1llu << (base + x); + if (gathered_cur & bit) + continue; + + // Using texture gathering is only more efficient than direct + // sampling in the case where we expect to be able to use all + // four gathered texels, without having to discard any. So + // only do it if we suspect it will be a win rather than a + // loss. + int xx = x*x, xx1 = (x+1)*(x+1); + int yy = y*y, yy1 = (y+1)*(y+1); + bool use_gather = PL_MAX(xx, xx1) + PL_MAX(yy, yy1) < radius2; + use_gather &= PL_MAX(x, y) <= sh_glsl(sh).max_gather_offset; + use_gather &= PL_MIN(x, y) >= sh_glsl(sh).min_gather_offset; + use_gather &= !src->tex || src->tex->params.format->gatherable; + + // Gathering from components other than the R channel requires + // support for GLSL 400, which introduces the overload of + // textureGather* that allows specifying the component. 
+ // + // This is also the minimum requirement if we don't know the + // texture format capabilities, for the sampler2D interface + if (cmask != 0x1 || !src->tex) + use_gather &= sh_glsl(sh).version >= 400; + + if (!use_gather) { + // Switch to direct sampling instead + polar_sample(sh, obj->filter, src_tex, lut, radius_c, + x, y, cmask, NULL_IDENT, use_ar, scale); + continue; + } + + // Gather the four surrounding texels simultaneously + for (uint8_t comps = cmask; comps;) { + uint8_t c = __builtin_ctz(comps); + if (x || y) { + if (c) { + GLSL($"_%d = textureGatherOffset("$", " + "center, ivec2(%d, %d), %d); \n", + in, c, src_tex, x, y, c); + } else { + GLSL($"_0 = textureGatherOffset("$", " + "center, ivec2(%d, %d)); \n", + in, src_tex, x, y); + } + } else { + if (c) { + GLSL($"_%d = textureGather("$", center, %d); \n", + in, c, src_tex, c); + } else { + GLSL($"_0 = textureGather("$", center); \n", + in, src_tex); + } + } + comps &= ~(1 << c); + } + + // Mix in all of the points with their weights + for (int p = 0; p < 4; p++) { + // The four texels are gathered counterclockwise starting + // from the bottom left + static const int xo[4] = {0, 1, 1, 0}; + static const int yo[4] = {1, 1, 0, 0}; + if (x+xo[p] > bound || y+yo[p] > bound) + continue; // next subpixel + + GLSL("idx = %d;\n", p); + polar_sample(sh, obj->filter, src_tex, lut, radius_c, + x+xo[p], y+yo[p], cmask, in, use_ar, scale); + } + + // Mark the other next row's pixels as already gathered + gathered_next |= bit | (bit << 1); + x++; // skip adjacent pixel + } + + // Prepare for new row + gathered_cur = gathered_next; + gathered_next = 0; + } + } + +#pragma GLSL \ + color = $scale / wsum * color; \ + @if (use_ar) { \ + @for (c : cmask) { \ + ww = ar@c / wwsum@c; \ + ww.x = 1.0 - ww.x; \ + w = clamp(color[@c], ww.x, ww.y); \ + w = mix(w, dot(ww, vec2(0.5)), ww.x > ww.y); \ + color[@c] = mix(color[@c], w, ${float:cfg.antiring}); \ + @} \ + @} \ + @if (!(cmask & (1 << PL_CHANNEL_A))) \ + color.a = 1.0; 
\ + } + + return true; +} + +static void fill_ortho_lut(void *data, const struct sh_lut_params *params) +{ + const struct sh_sampler_obj *obj = params->priv; + pl_filter filt = obj->filter; + + if (filt->radius == filt->radius_zero) { + // Main lobe covers entire radius, so all weights are positive, meaning + // we can use the linear resampling trick + for (int n = 0; n < SCALER_LUT_SIZE; n++) { + const float *weights = filt->weights + n * filt->row_stride; + float *row = (float *) data + n * filt->row_stride; + pl_assert(filt->row_size % 2 == 0); + for (int i = 0; i < filt->row_size; i += 2) { + const float w0 = weights[i], w1 = weights[i+1]; + assert(w0 + w1 >= 0.0f); + row[i] = w0 + w1; + row[i+1] = w1 / (w0 + w1); + } + } + } else { + size_t entries = SCALER_LUT_SIZE * filt->row_stride; + pl_assert(params->width * params->height * params->comps == entries); + memcpy(data, filt->weights, entries * sizeof(float)); + } +} + +enum { + SEP_VERT = 0, + SEP_HORIZ, + SEP_PASSES +}; + +bool pl_shader_sample_ortho2(pl_shader sh, const struct pl_sample_src *src, + const struct pl_sample_filter_params *params) +{ + pl_assert(params); + if (params->filter.polar) { + SH_FAIL(sh, "Trying to use separated sampling with a polar filter?"); + return false; + } + + pl_gpu gpu = SH_GPU(sh); + pl_assert(gpu); + + uint8_t comps; + float ratio[SEP_PASSES], scale; + ident_t src_tex, pos, pt; + if (!setup_src(sh, src, &src_tex, &pos, &pt, + &ratio[SEP_HORIZ], &ratio[SEP_VERT], + &comps, &scale, false, LINEAR)) + return false; + + + int pass; + if (fabs(ratio[SEP_HORIZ] - 1.0f) < 1e-6f) { + pass = SEP_VERT; + } else if (fabs(ratio[SEP_VERT] - 1.0f) < 1e-6f) { + pass = SEP_HORIZ; + } else { + SH_FAIL(sh, "Trying to use pl_shader_sample_ortho with a " + "pl_sample_src that requires scaling in multiple directions " + "(rx=%f, ry=%f), this is not possible!", + ratio[SEP_HORIZ], ratio[SEP_VERT]); + return false; + } + + // We can store a separate sampler object per dimension, so dispatch the 
+ // right one. This is needed for two reasons: + // 1. Anamorphic content can have a different scaling ratio for each + // dimension. In particular, you could be upscaling in one and + // downscaling in the other. + // 2. After fixing the source for `setup_src`, we lose information about + // the scaling ratio of the other component. (Although this is only a + // minor reason and could easily be changed with some boilerplate) + struct sh_sampler_obj *obj; + obj = SH_OBJ(sh, params->lut, PL_SHADER_OBJ_SAMPLER, + struct sh_sampler_obj, sh_sampler_uninit); + if (!obj) + return false; + + if (pass != 0) { + obj = SH_OBJ(sh, &obj->pass2, PL_SHADER_OBJ_SAMPLER, + struct sh_sampler_obj, sh_sampler_uninit); + assert(obj); + } + + float inv_scale = 1.0 / ratio[pass]; + inv_scale = PL_MAX(inv_scale, 1.0); + if (params->no_widening) + inv_scale = 1.0; + + struct pl_filter_config cfg = params->filter; + cfg.antiring = PL_DEF(cfg.antiring, params->antiring); + cfg.blur = PL_DEF(cfg.blur, 1.0f) * inv_scale; + bool update = !obj->filter || !pl_filter_config_eq(&obj->filter->params.config, &cfg); + + if (update) { + pl_filter_free(&obj->filter); + obj->filter = pl_filter_generate(sh->log, pl_filter_params( + .config = cfg, + .lut_entries = SCALER_LUT_SIZE, + .max_row_size = gpu->limits.max_tex_2d_dim / 4, + .row_stride_align = 4, + )); + + if (!obj->filter) { + // This should never happen, but just in case .. 
+ SH_FAIL(sh, "Failed initializing separated filter!"); + return false; + } + } + + int N = obj->filter->row_size; // number of samples to convolve + int width = obj->filter->row_stride / 4; // width of the LUT texture + ident_t lut = sh_lut(sh, sh_lut_params( + .object = &obj->lut, + .var_type = PL_VAR_FLOAT, + .method = SH_LUT_LINEAR, + .width = width, + .height = SCALER_LUT_SIZE, + .comps = 4, + .update = update, + .fill = fill_ortho_lut, + .priv = obj, + )); + if (!lut) { + SH_FAIL(sh, "Failed initializing separated LUT!"); + return false; + } + + const int dir[SEP_PASSES][2] = { + [SEP_HORIZ] = {1, 0}, + [SEP_VERT] = {0, 1}, + }; + + static const char *names[SEP_PASSES] = { + [SEP_HORIZ] = "ortho (horiz)", + [SEP_VERT] = "ortho (vert)", + }; + + describe_filter(sh, &cfg, names[pass], ratio[pass], ratio[pass]); + + float denom = PL_MAX(1, width - 1); // avoid division by zero + bool use_ar = cfg.antiring > 0 && ratio[pass] > 1.0; + bool use_linear = obj->filter->radius == obj->filter->radius_zero; + use_ar &= !use_linear; // filter has no negative weights + +#pragma GLSL /* pl_shader_sample_ortho */ \ + vec4 color = vec4(0.0, 0.0, 0.0, 1.0); \ + { \ + vec2 pos = $pos, pt = $pt; \ + vec2 size = vec2(textureSize($src_tex, 0)); \ + vec2 dir = vec2(${const float:dir[pass][0]}, ${const float: dir[pass][1]}); \ + pt *= dir; \ + vec2 fcoord2 = fract(pos * size - vec2(0.5)); \ + float fcoord = dot(fcoord2, dir); \ + vec2 base = pos - fcoord * pt - pt * vec2(${const float: N / 2 - 1}); \ + vec4 ws; \ + float off; \ + ${vecType: comps} c, ca = ${vecType: comps}(0.0); \ + @if (use_ar) { \ + ${vecType: comps} hi = ${vecType: comps}(0.0); \ + ${vecType: comps} lo = ${vecType: comps}(1e9); \ + @} \ + @for (n < N) { \ + @if @(n % 4 == 0) \ + ws = $lut(vec2(float(@n / 4) / ${const float: denom}, fcoord)); \ + @if @(vars.use_ar && (n == vars.n / 2 - 1 || n == vars.n / 2)) { \ + c = textureLod($src_tex, base + pt * @n.0, 0.0).${swizzle: comps}; \ + ca += ws[@n % 4] * c; \ + lo = 
min(lo, c); \ + hi = max(hi, c); \ + @} else { \ + @if (use_linear) { \ + @if @(n % 2 == 0) { \ + off = @n.0 + ws[@n % 4 + 1]; \ + ca += ws[@n % 4] * textureLod($src_tex, base + pt * off, \ + 0.0).${swizzle: comps}; \ + @} \ + @} else { \ + ca += ws[@n % 4] * textureLod($src_tex, base + pt * @n.0, \ + 0.0).${swizzle: comps}; \ + @} \ + @} \ + @} \ + @if (use_ar) \ + ca = mix(ca, clamp(ca, lo, hi), ${float: cfg.antiring}); \ + color.${swizzle: comps} = ${float: scale} * ca; \ + } + + return true; +} + +const struct pl_distort_params pl_distort_default_params = { PL_DISTORT_DEFAULTS }; + +void pl_shader_distort(pl_shader sh, pl_tex src_tex, int out_w, int out_h, + const struct pl_distort_params *params) +{ + pl_assert(params); + if (!sh_require(sh, PL_SHADER_SIG_NONE, out_w, out_h)) + return; + + const int src_w = src_tex->params.w, src_h = src_tex->params.h; + float rx = 1.0f, ry = 1.0f; + if (src_w > src_h) { + ry = (float) src_h / src_w; + } else { + rx = (float) src_w / src_h; + } + + // Map from texel coordinates [0,1]² to aspect-normalized representation + const pl_transform2x2 tex2norm = { + .mat.m = { + { 2 * rx, 0 }, + { 0, -2 * ry }, + }, + .c = { -rx, ry }, + }; + + // Map from aspect-normalized representation to canvas coords [-1,1]² + const float sx = params->unscaled ? (float) src_w / out_w : 1.0f; + const float sy = params->unscaled ? 
(float) src_h / out_h : 1.0f; + const pl_transform2x2 norm2canvas = { + .mat.m = { + { sx / rx, 0 }, + { 0, sy / ry }, + }, + }; + + struct pl_transform2x2 transform = params->transform; + pl_transform2x2_mul(&transform, &tex2norm); + pl_transform2x2_rmul(&norm2canvas, &transform); + + if (params->constrain) { + pl_rect2df bb = pl_transform2x2_bounds(&transform, &(pl_rect2df) { + .x1 = 1, .y1 = 1, + }); + const float k = fmaxf(fmaxf(pl_rect_w(bb), pl_rect_h(bb)), 2.0f); + pl_transform2x2_scale(&transform, 2.0f / k); + }; + + // Bind the canvas coordinates as [-1,1]², flipped vertically to correspond + // to normal mathematical axis conventions + static const pl_rect2df canvas = { + .x0 = -1.0f, .x1 = 1.0f, + .y0 = 1.0f, .y1 = -1.0f, + }; + + ident_t pos = sh_attr_vec2(sh, "pos", &canvas); + ident_t pt, tex = sh_bind(sh, src_tex, params->address_mode, + PL_TEX_SAMPLE_LINEAR, "tex", NULL, NULL, &pt); + + // Bind the inverse of the tex2canvas transform (i.e. canvas2tex) + pl_transform2x2_invert(&transform); + ident_t tf = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_mat2("tf"), + .data = PL_TRANSPOSE_2X2(transform.mat.m), + }); + + ident_t tf_c = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_vec2("tf_c"), + .data = transform.c, + }); + + // See pl_shader_sample_bicubic + sh_describe(sh, "distortion"); +#pragma GLSL /* pl_shader_sample_distort */ \ + vec4 color; \ + { \ + vec2 pos = $tf * $pos + $tf_c; \ + vec2 pt = $pt; \ + @if (params->bicubic) { \ + vec2 size = vec2(textureSize($tex, 0)); \ + vec2 frac = fract(pos * size + vec2(0.5)); \ + vec2 frac2 = frac * frac; \ + vec2 inv = vec2(1.0) - frac; \ + vec2 inv2 = inv * inv; \ + vec2 w0 = 1.0/6.0 * inv2 * inv; \ + vec2 w1 = 2.0/3.0 - 0.5 * frac2 * (2.0 - frac); \ + vec2 w2 = 2.0/3.0 - 0.5 * inv2 * (2.0 - inv); \ + vec2 w3 = 1.0/6.0 * frac2 * frac; \ + vec4 g = vec4(w0 + w1, w2 + w3); \ + vec4 h = vec4(w1, w3) / g + inv.xyxy; \ + h.xy -= vec2(2.0); \ + vec4 p = pos.xyxy + pt.xyxy * h; \ + vec4 c00 = 
textureLod($tex, p.xy, 0.0); \ + vec4 c01 = textureLod($tex, p.xw, 0.0); \ + vec4 c0 = mix(c01, c00, g.y); \ + vec4 c10 = textureLod($tex, p.zy, 0.0); \ + vec4 c11 = textureLod($tex, p.zw, 0.0); \ + vec4 c1 = mix(c11, c10, g.y); \ + color = mix(c1, c0, g.x); \ + @} else { \ + color = texture($tex, pos); \ + @} \ + @if (params->alpha_mode) { \ + vec2 border = min(pos, vec2(1.0) - pos); \ + border = smoothstep(vec2(0.0), pt, border); \ + @if (params->alpha_mode == PL_ALPHA_PREMULTIPLIED) \ + color.rgba *= border.x * border.y; \ + @else \ + color.a *= border.x * border.y; \ + @} \ + } + +} |