From ff6e3c025658a5fa1affd094f220b623e7e1b24b Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Mon, 15 Apr 2024 22:38:23 +0200
Subject: Adding upstream version 6.338.2.

Signed-off-by: Daniel Baumann
---
 src/shaders/dithering.c | 527 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 527 insertions(+)
 create mode 100644 src/shaders/dithering.c

diff --git a/src/shaders/dithering.c b/src/shaders/dithering.c
new file mode 100644
index 0000000..4485d11
--- /dev/null
+++ b/src/shaders/dithering.c
@@ -0,0 +1,527 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <math.h>
+#include "shaders.h"
+
+#include <libplacebo/shaders/dithering.h>
+
+const struct pl_dither_params pl_dither_default_params = { PL_DITHER_DEFAULTS };
+
+struct sh_dither_obj {
+    pl_shader_obj lut;
+};
+
+static void sh_dither_uninit(pl_gpu gpu, void *ptr)
+{
+    struct sh_dither_obj *obj = ptr;
+    pl_shader_obj_destroy(&obj->lut);
+    *obj = (struct sh_dither_obj) {0};
+}
+
+static void fill_dither_matrix(void *data, const struct sh_lut_params *params)
+{
+    pl_assert(params->width > 0 && params->height > 0 && params->comps == 1);
+
+    const struct pl_dither_params *dpar = params->priv;
+    switch (dpar->method) {
+    case PL_DITHER_ORDERED_LUT:
+        pl_assert(params->width == params->height);
+        pl_generate_bayer_matrix(data, params->width);
+        return;
+
+    case PL_DITHER_BLUE_NOISE:
+        pl_assert(params->width == params->height);
+        pl_generate_blue_noise(data, params->width);
+        return;
+
+    case PL_DITHER_ORDERED_FIXED:
+    case PL_DITHER_WHITE_NOISE:
+    case PL_DITHER_METHOD_COUNT:
+        return;
+    }
+
+    pl_unreachable();
+}
+
+static bool dither_method_is_lut(enum pl_dither_method method)
+{
+    switch (method) {
+    case PL_DITHER_BLUE_NOISE:
+    case PL_DITHER_ORDERED_LUT:
+        return true;
+    case PL_DITHER_ORDERED_FIXED:
+    case PL_DITHER_WHITE_NOISE:
+        return false;
+    case PL_DITHER_METHOD_COUNT:
+        break;
+    }
+
+    pl_unreachable();
+}
+
+static inline float approx_gamma(enum pl_color_transfer trc)
+{
+    switch (trc) {
+    case PL_COLOR_TRC_UNKNOWN:   return 1.0f;
+    case PL_COLOR_TRC_LINEAR:    return 1.0f;
+    case PL_COLOR_TRC_PRO_PHOTO: return 1.8f;
+    case PL_COLOR_TRC_GAMMA18:   return 1.8f;
+    case PL_COLOR_TRC_GAMMA20:   return 2.0f;
+    case PL_COLOR_TRC_GAMMA24:   return 2.4f;
+    case PL_COLOR_TRC_GAMMA26:   return 2.6f;
+    case PL_COLOR_TRC_ST428:     return 2.6f;
+    case PL_COLOR_TRC_GAMMA28:   return 2.8f;
+
+    case PL_COLOR_TRC_SRGB:
+    case PL_COLOR_TRC_BT_1886:
+    case PL_COLOR_TRC_GAMMA22:
+        return 2.2f;
+
+    case PL_COLOR_TRC_PQ:
+    case PL_COLOR_TRC_HLG:
+    case PL_COLOR_TRC_V_LOG:
+    case PL_COLOR_TRC_S_LOG1:
+    case PL_COLOR_TRC_S_LOG2:
+        return 2.0f; // TODO: handle this better
+
+    case PL_COLOR_TRC_COUNT: break;
+    }
+
+    pl_unreachable();
+}
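+
+/*
+ * Illustrative usage sketch (an assumption about typical callers, not code
+ * from this patch): dither a shader's output down to 8 bits, reusing one
+ * state object across frames so the LUT is only generated once:
+ *
+ *     pl_shader_obj dither_state = NULL;
+ *     struct pl_dither_params dpar = pl_dither_default_params;
+ *     dpar.temporal = true; // hypothetical choice
+ *     pl_shader_dither(sh, 8, &dither_state, &dpar);
+ *     // ... once per frame; at teardown:
+ *     pl_shader_obj_destroy(&dither_state);
+ */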
+
+void pl_shader_dither(pl_shader sh, int new_depth,
+                      pl_shader_obj *dither_state,
+                      const struct pl_dither_params *params)
+{
+    if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+        return;
+
+    if (new_depth <= 0 || new_depth > 256) {
+        PL_WARN(sh, "Invalid dither depth: %d, ignoring", new_depth);
+        return;
+    }
+
+    sh_describef(sh, "dithering (%d bits)", new_depth);
+    GLSL("// pl_shader_dither \n"
+         "{                   \n"
+         "float bias;         \n");
+
+    params = PL_DEF(params, &pl_dither_default_params);
+    if (params->lut_size < 0 || params->lut_size > 8) {
+        SH_FAIL(sh, "Invalid `lut_size` specified: %d", params->lut_size);
+        return;
+    }
+
+    enum pl_dither_method method = params->method;
+    ident_t lut = NULL_IDENT;
+    int lut_size = 0;
+
+    if (dither_method_is_lut(method)) {
+        if (!dither_state) {
+            PL_WARN(sh, "LUT-based dither method specified but no dither state "
+                    "object given, falling back to non-LUT-based methods.");
+            goto fallback;
+        }
+
+        struct sh_dither_obj *obj;
+        obj = SH_OBJ(sh, dither_state, PL_SHADER_OBJ_DITHER,
+                     struct sh_dither_obj, sh_dither_uninit);
+        if (!obj)
+            goto fallback;
+
+        bool cache = method == PL_DITHER_BLUE_NOISE;
+        lut_size = 1 << PL_DEF(params->lut_size, pl_dither_default_params.lut_size);
+        lut = sh_lut(sh, sh_lut_params(
+            .object    = &obj->lut,
+            .var_type  = PL_VAR_FLOAT,
+            .width     = lut_size,
+            .height    = lut_size,
+            .comps     = 1,
+            .fill      = fill_dither_matrix,
+            .signature = (CACHE_KEY_DITHER ^ method) * lut_size,
+            .cache     = cache ? SH_CACHE(sh) : NULL,
+            .priv      = (void *) params,
+        ));
+        if (!lut)
+            goto fallback;
+    }
+
+    goto done;
+
+fallback:
+    method = PL_DITHER_ORDERED_FIXED;
+    // fall through
+
+done: ;
+
+    int size = 0;
+    if (lut) {
+        size = lut_size;
+    } else if (method == PL_DITHER_ORDERED_FIXED) {
+        size = 16; // hard-coded size
+    }
+
+    if (size) {
+        // Transform the screen position to the cyclic range [0,1)
+        GLSL("vec2 pos = fract(gl_FragCoord.xy * 1.0/"$"); \n", SH_FLOAT(size));
+
+        if (params->temporal) {
+            int phase = SH_PARAMS(sh).index % 8;
+            float r = phase * (M_PI / 2); // rotate
+            float m = phase < 4 ? 1 : -1; // mirror
+            float mat[2][2] = {
+                {cos(r),     -sin(r)    },
+                {sin(r) * m,  cos(r) * m},
+            };
+
+            ident_t rot = sh_var(sh, (struct pl_shader_var) {
+                .var  = pl_var_mat2("dither_rot"),
+                .data = &mat[0][0],
+                .dynamic = true,
+            });
+            GLSL("pos = fract("$" * pos + vec2(1.0));\n", rot);
+        }
+    }
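+
+    /*
+     * Note on the temporal path above: with phase = index % 8, the matrix
+     * steps through the four 90-degree rotations of the dither pattern
+     * (phases 0-3, m = 1) and then their mirrored versions (phases 4-7,
+     * m = -1). For example, phase 1 yields
+     *
+     *     mat = { {0, -1},
+     *             {1,  0} };  // rotate by 90 degrees
+     *
+     * so consecutive frames sample differently-oriented copies of the same
+     * matrix; the vec2(1.0) added before fract() keeps the rotated
+     * coordinate non-negative.
+     */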
+
+    switch (method) {
+    case PL_DITHER_WHITE_NOISE: {
+        ident_t prng = sh_prng(sh, params->temporal, NULL);
+        GLSL("bias = "$".x;\n", prng);
+        break;
+    }
+
+    case PL_DITHER_ORDERED_FIXED:
+        // Bitwise ordered dither using only 32-bit uints
+        GLSL("uvec2 xy = uvec2(pos * 16.0) %% 16u;     \n"
+             // Bitwise merge (Morton number)
+             "xy.x = xy.x ^ xy.y;                      \n"
+             "xy = (xy | xy << 2) & uvec2(0x33333333); \n"
+             "xy = (xy | xy << 1) & uvec2(0x55555555); \n"
+             // Bitwise reversal
+             "uint b = xy.x + (xy.y << 1);             \n"
+             "b = (b * 0x0802u & 0x22110u) |           \n"
+             "    (b * 0x8020u & 0x88440u);            \n"
+             "b = 0x10101u * b;                        \n"
+             "b = (b >> 16) & 0xFFu;                   \n"
+             // Generate bias value
+             "bias = float(b) * 1.0/256.0;             \n");
+        break;
+
+    case PL_DITHER_BLUE_NOISE:
+    case PL_DITHER_ORDERED_LUT:
+        pl_assert(lut);
+        GLSL("bias = "$"(ivec2(pos * "$"));\n", lut, SH_FLOAT(lut_size));
+        break;
+
+    case PL_DITHER_METHOD_COUNT:
+        pl_unreachable();
+    }
+
+    // Scale factor for dither rounding
+    GLSL("const float scale = %llu.0; \n", (1LLU << new_depth) - 1);
+
+    const float gamma = approx_gamma(params->transfer);
+    if (gamma != 1.0f && new_depth <= 4) {
+        GLSL("const float gamma = "$";                  \n"
+             "vec4 color_lin = pow(color, vec4(gamma)); \n",
+             SH_FLOAT(gamma));
+
+        if (new_depth == 1) {
+            // Special case for bit depth 1 dithering: we can ignore the
+            // low/high rounding entirely, because we know we are always
+            // dithering between 0.0 and 1.0.
+            GLSL("const vec4 low = vec4(0.0);  \n"
+                 "const vec4 high = vec4(1.0); \n"
+                 "vec4 offset = color_lin;     \n");
+        } else {
+            // Linearize the low, high and current color values
+            GLSL("vec4 low  = floor(color * scale) / scale; \n"
+                 "vec4 high = ceil(color * scale) / scale;  \n"
+                 "vec4 low_lin  = pow(low, vec4(gamma));    \n"
+                 "vec4 high_lin = pow(high, vec4(gamma));   \n"
+                 "vec4 range    = high_lin - low_lin;       \n"
+                 "vec4 offset   = (color_lin - low_lin) /   \n"
+                 "                max(range, 1e-6);         \n");
+        }
+
+        // Mix in the correct ratio corresponding to the offset and bias
+        GLSL("color = mix(low, high, greaterThan(offset, vec4(bias))); \n");
+    } else {
+        // Approximate each gamma segment as a straight line; this simplifies
+        // the process of dithering down to a single scale and (biased) round.
+        GLSL("color = scale * color + vec4(bias);   \n"
+             "color = floor(color) * (1.0 / scale); \n");
+    }
+
+    GLSL("} \n");
+}
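+
+/*
+ * Worked example of the fast path above: with new_depth = 8 the generated
+ * GLSL is, in effect,
+ *
+ *     color = 255.0 * color + vec4(bias);
+ *     color = floor(color) * (1.0 / 255.0);
+ *
+ * so a component c = 0.5 quantizes to 127/255 when bias < 0.5 and to
+ * 128/255 otherwise; averaged over many pixels, the rounding direction
+ * follows the bias distribution and the mean value of c is preserved.
+ */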
+
+/* Error diffusion code is taken from mpv, original copyright (c) 2019 Bin Jin
+ *
+ * mpv is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with mpv. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+// After a (y, x) -> (y, x + y * shift) mapping, find the rightmost column
+// that will be affected by the current column.
+static int compute_rightmost_shifted_column(const struct pl_error_diffusion_kernel *k)
+{
+    int ret = 0;
+    for (int y = 0; y <= PL_EDF_MAX_DY; y++) {
+        for (int x = PL_EDF_MIN_DX; x <= PL_EDF_MAX_DX; x++) {
+            if (k->pattern[y][x - PL_EDF_MIN_DX] != 0) {
+                int shifted_x = x + y * k->shift;
+
+                // The shift mapping guarantees that the current column (and
+                // everything left of it) won't be affected by error diffusion.
+                assert(shifted_x > 0);
+
+                ret = PL_MAX(ret, shifted_x);
+            }
+        }
+    }
+    return ret;
+}
+
+size_t pl_error_diffusion_shmem_req(const struct pl_error_diffusion_kernel *kernel,
+                                    int height)
+{
+    // We add PL_EDF_MAX_DY empty lines on the bottom to handle errors
+    // propagated out from the bottom side.
+    int rows = height + PL_EDF_MAX_DY;
+    int shifted_columns = compute_rightmost_shifted_column(kernel) + 1;
+
+    // The shared memory is an array of size rows*shifted_columns. Each
+    // element is a single uint packing the three RGB error components.
+    return rows * shifted_columns * sizeof(uint32_t);
+}
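+
+/*
+ * Worked sizing example (illustrative): for the Floyd-Steinberg kernel
+ * pictured below (shift = 2), the nonzero taps sit at (dy, dx) = (0, +1),
+ * (1, -1), (1, 0) and (1, +1), so the rightmost shifted column is
+ * 1 + 1 * 2 = 3 and shifted_columns = 4. A 1080-pixel-tall image therefore
+ * needs (1080 + PL_EDF_MAX_DY) * 4 * sizeof(uint32_t) bytes of shared
+ * memory, on the order of 17 KiB.
+ */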
+
+bool pl_shader_error_diffusion(pl_shader sh, const struct pl_error_diffusion_params *params)
+{
+    const int width = params->input_tex->params.w, height = params->input_tex->params.h;
+    const struct pl_glsl_version glsl = sh_glsl(sh);
+    const struct pl_error_diffusion_kernel *kernel =
+        PL_DEF(params->kernel, &pl_error_diffusion_sierra_lite);
+
+    pl_assert(params->output_tex->params.w == width);
+    pl_assert(params->output_tex->params.h == height);
+    if (!sh_require(sh, PL_SHADER_SIG_NONE, width, height))
+        return false;
+
+    if (params->new_depth <= 0 || params->new_depth > 256) {
+        PL_WARN(sh, "Invalid dither depth: %d, ignoring", params->new_depth);
+        return false;
+    }
+
+    // The parallel error diffusion works by applying the shift mapping
+    // first. Take the Floyd-Steinberg algorithm as an example: after
+    // applying the (y, x) -> (y, x + y * shift) mapping (with shift = 2),
+    // all errors are propagated into the next few columns, which makes
+    // parallel processing of the same column possible.
+    //
+    //        X     7/16                 X    7/16
+    //  3/16  5/16  1/16    ==>    0     0    3/16  5/16  1/16
+
+    // Figure out the size of the rectangle containing all shifted pixels.
+    // The rectangle height is unchanged.
+    int shifted_width = width + (height - 1) * kernel->shift;
+
+    // We process all pixels of the shifted rectangle column by column, with
+    // a single global work group of size |block_size|.
+    // Figure out how many blocks are required to process all pixels. We need
+    // this explicitly to make the number of barrier() calls match.
+    int block_size = PL_MIN(glsl.max_group_threads, height);
+    int blocks = PL_DIV_UP(height * shifted_width, block_size);
+
+    // Since we know how many of the subsequent columns will be affected
+    // while the current column is being processed, we only need to store the
+    // errors of a few columns in shared memory. Using a ring buffer further
+    // reduces the cost of advancing to the next column.
+    int ring_buffer_rows = height + PL_EDF_MAX_DY;
+    int ring_buffer_columns = compute_rightmost_shifted_column(kernel) + 1;
+    ident_t ring_buffer_size = sh_const(sh, (struct pl_shader_const) {
+        .type = PL_VAR_UINT,
+        .name = "ring_buffer_size",
+        .data = &(unsigned) { ring_buffer_rows * ring_buffer_columns },
+        .compile_time = true,
+    });
+
+    // Compute the shared memory requirements and try enabling compute shaders.
+    size_t shmem_req = ring_buffer_rows * ring_buffer_columns * sizeof(uint32_t);
+    if (!sh_try_compute(sh, block_size, 1, false, shmem_req)) {
+        PL_ERR(sh, "Cannot execute error diffusion kernel: GPU too old, or "
+               "insufficient compute shader memory!");
+        return false;
+    }
+
+    ident_t in_tex = sh_desc(sh, (struct pl_shader_desc) {
+        .binding.object = params->input_tex,
+        .desc = {
+            .name = "input_tex",
+            .type = PL_DESC_SAMPLED_TEX,
+        },
+    });
+
+    ident_t out_img = sh_desc(sh, (struct pl_shader_desc) {
+        .binding.object = params->output_tex,
+        .desc = {
+            .name   = "output_tex",
+            .type   = PL_DESC_STORAGE_IMG,
+            .access = PL_DESC_ACCESS_WRITEONLY,
+        },
+    });
+
+    sh->output = PL_SHADER_SIG_NONE;
+    sh_describef(sh, "error diffusion (%s, %d bits)",
+                 kernel->name, params->new_depth);
+
+    // Define the ring buffer in shared memory.
+    GLSLH("shared uint err_rgb8["$"]; \n", ring_buffer_size);
+    GLSL("// pl_shader_error_diffusion                                       \n"
+         // Safeguard against accidental over-execution
+         "if (gl_WorkGroupID != uvec3(0))                                    \n"
+         "    return;                                                        \n"
+         // Initialize the ring buffer.
+         "for (uint i = gl_LocalInvocationIndex; i < "$"; i += gl_WorkGroupSize.x) \n"
+         "    err_rgb8[i] = 0u;                                              \n"
+
+         // Main block loop; the barrier here ensures the previous block is
+         // fully processed before processing of the next one starts.
+         "for (uint block_id = 0; block_id < "$"; block_id++) {              \n"
+         "barrier();                                                         \n"
+         // Compute the coordinates of the pixel we are currently processing,
+         // both before and after the shift mapping.
+         "uint id = block_id * gl_WorkGroupSize.x + gl_LocalInvocationIndex; \n"
+         "const uint height = "$";                                           \n"
+         "int y = int(id %% height), x_shifted = int(id / height);           \n"
+         "int x = x_shifted - y * %d;                                        \n"
+         // Proceed only if we are processing a valid pixel.
+         "if (x >= 0 && x < "$") {                                           \n"
+         // The index of the current pixel in the ring buffer.
+         "uint idx = uint(x_shifted * "$" + y) %% "$";                       \n"
+         // Fetch the current pixel.
+         "vec4 pix_orig = texelFetch("$", ivec2(x, y), 0);                   \n"
+         "vec3 pix = pix_orig.rgb;                                           \n",
+         ring_buffer_size,
+         SH_UINT(blocks),
+         SH_UINT(height),
+         kernel->shift,
+         SH_INT(width),
+         SH_INT(ring_buffer_rows),
+         ring_buffer_size,
+         in_tex);
+
+    // The dithering will quantize pixel values into multiples of
+    // 1/dither_quant.
+    int dither_quant = (1 << params->new_depth) - 1;
+
+    // We encode the errors of the RGB components into a single 32-bit
+    // unsigned integer. The error we propagate from the current pixel is in
+    // the range [-0.5 / dither_quant, 0.5 / dither_quant]. While not quite
+    // obvious, the sum of all errors propagated into a pixel is also in the
+    // same range, so it's possible to map errors in this range into
+    // [-127, 127] and store each of them in an 8-bit unsigned integer (using
+    // standard two's complement). The three 8-bit unsigned integers can then
+    // be packed into a single 32-bit unsigned integer, with two 4-bit
+    // paddings that prevent overflow from the addition operations affecting
+    // the other components. There are at most 12 addition operations on each
+    // pixel, so 4 bits of padding are enough; overflow out of the R
+    // component is simply discarded.
+    //
+    // The following figure shows the encoding layout:
+    //
+    // +------------------------------------+
+    // |RRRRRRRR|0000|GGGGGGGG|0000|BBBBBBBB|
+    // +------------------------------------+
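+
+    /*
+     * Decoding walkthrough with illustrative numbers: suppose the errors
+     * accumulated into a pixel's B lane sum to -10. Two's complement storage
+     * makes that byte -10 & 0xFF = 0xF6. At read time, the code below adds
+     * the 128 bias (0xF6 + 0x80 = 0x176), masks to 8 bits (0x76 = 118) and
+     * subtracts 128, recovering -10; the carry into bit 8 lands harmlessly
+     * in the 4-bit padding lane.
+     */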
+
+    // The bitshift positions for the R and G components.
+    const int bitshift_r = 24, bitshift_g = 12;
+    // The multiplier we use to map [-0.5, 0.5] to [-127, 127].
+    const int uint8_mul = 127 * 2;
+
+    GLSL(// Add the errors previously propagated into the current pixel, and
+         // clear its slot in the ring buffer.
+         "uint err_u32 = err_rgb8[idx] + %uu;                                 \n"
+         "pix = pix * %d.0 + vec3(int((err_u32 >> %d) & 0xFFu) - 128,         \n"
+         "                        int((err_u32 >> %d) & 0xFFu) - 128,         \n"
+         "                        int( err_u32        & 0xFFu) - 128) / %d.0; \n"
+         "err_rgb8[idx] = 0u;                                                 \n"
+         // Write the dithered pixel.
+         "vec3 dithered = round(pix);                                         \n"
+         "imageStore("$", ivec2(x, y), vec4(dithered / %d.0, pix_orig.a));    \n"
+         // Prepare for the error propagation pass.
+         "vec3 err_divided = (pix - dithered) * %d.0 / %d.0;                  \n"
+         "ivec3 tmp;                                                          \n",
+         (128u << bitshift_r) | (128u << bitshift_g) | 128u,
+         dither_quant, bitshift_r, bitshift_g, uint8_mul,
+         out_img, dither_quant,
+         uint8_mul, kernel->divisor);
+
+    // Group error propagation taps with the same weight factor together, in
+    // order to reduce the number of error encoding operations.
+    for (int dividend = 1; dividend <= kernel->divisor; dividend++) {
+        bool err_assigned = false;
+
+        for (int y = 0; y <= PL_EDF_MAX_DY; y++) {
+            for (int x = PL_EDF_MIN_DX; x <= PL_EDF_MAX_DX; x++) {
+                if (kernel->pattern[y][x - PL_EDF_MIN_DX] != dividend)
+                    continue;
+
+                if (!err_assigned) {
+                    err_assigned = true;
+
+                    GLSL("tmp = ivec3(round(err_divided * %d.0)); \n"
+                         "err_u32 = (uint(tmp.r & 0xFF) << %d) |  \n"
+                         "          (uint(tmp.g & 0xFF) << %d) |  \n"
+                         "           uint(tmp.b & 0xFF);          \n",
+                         dividend,
+                         bitshift_r, bitshift_g);
+                }
+
+                int shifted_x = x + y * kernel->shift;
+
+                // Unlike the right border, errors propagated out from the
+                // left border will remain in the ring buffer. This will
+                // produce visible artifacts near the left border, especially
+                // for shift=3 kernels.
+                if (x < 0)
+                    GLSL("if (x >= %d) \n", -x);
+
+                // Calculate the new position in the ring buffer to propagate
+                // the error into.
+                int ring_buffer_delta = shifted_x * ring_buffer_rows + y;
+                GLSL("atomicAdd(err_rgb8[(idx + %du) %% "$"], err_u32); \n",
+                     ring_buffer_delta, ring_buffer_size);
+            }
+        }
+    }
+
+    GLSL("}} \n"); // end of main loop + valid pixel conditional
+    return true;
+}
-- 
cgit v1.2.3