/* * This file is part of libplacebo. * * libplacebo is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * libplacebo is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with libplacebo. If not, see . */ #include #include "shaders.h" #include const struct pl_dither_params pl_dither_default_params = { PL_DITHER_DEFAULTS }; struct sh_dither_obj { pl_shader_obj lut; }; static void sh_dither_uninit(pl_gpu gpu, void *ptr) { struct sh_dither_obj *obj = ptr; pl_shader_obj_destroy(&obj->lut); *obj = (struct sh_dither_obj) {0}; } static void fill_dither_matrix(void *data, const struct sh_lut_params *params) { pl_assert(params->width > 0 && params->height > 0 && params->comps == 1); const struct pl_dither_params *dpar = params->priv; switch (dpar->method) { case PL_DITHER_ORDERED_LUT: pl_assert(params->width == params->height); pl_generate_bayer_matrix(data, params->width); return; case PL_DITHER_BLUE_NOISE: pl_assert(params->width == params->height); pl_generate_blue_noise(data, params->width); return; case PL_DITHER_ORDERED_FIXED: case PL_DITHER_WHITE_NOISE: case PL_DITHER_METHOD_COUNT: return; } pl_unreachable(); } static bool dither_method_is_lut(enum pl_dither_method method) { switch (method) { case PL_DITHER_BLUE_NOISE: case PL_DITHER_ORDERED_LUT: return true; case PL_DITHER_ORDERED_FIXED: case PL_DITHER_WHITE_NOISE: return false; case PL_DITHER_METHOD_COUNT: break; } pl_unreachable(); } static inline float approx_gamma(enum pl_color_transfer trc) { switch (trc) { case PL_COLOR_TRC_UNKNOWN: return 1.0f; case PL_COLOR_TRC_LINEAR: return 1.0f; case PL_COLOR_TRC_PRO_PHOTO:return 1.8f; case PL_COLOR_TRC_GAMMA18: return 1.8f; case PL_COLOR_TRC_GAMMA20: return 2.0f; case PL_COLOR_TRC_GAMMA24: return 2.4f; case PL_COLOR_TRC_GAMMA26: return 2.6f; case PL_COLOR_TRC_ST428: return 2.6f; case PL_COLOR_TRC_GAMMA28: return 2.8f; case PL_COLOR_TRC_SRGB: case PL_COLOR_TRC_BT_1886: case PL_COLOR_TRC_GAMMA22: return 2.2f; case PL_COLOR_TRC_PQ: case PL_COLOR_TRC_HLG: case PL_COLOR_TRC_V_LOG: case PL_COLOR_TRC_S_LOG1: case PL_COLOR_TRC_S_LOG2: return 2.0f; // TODO: handle this better case PL_COLOR_TRC_COUNT: break; } pl_unreachable(); } void pl_shader_dither(pl_shader sh, int new_depth, pl_shader_obj *dither_state, const struct pl_dither_params *params) { if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) return; if (new_depth <= 0 || new_depth > 256) { PL_WARN(sh, "Invalid dither depth: %d.. ignoring", new_depth); return; } sh_describef(sh, "dithering (%d bits)", new_depth); GLSL("// pl_shader_dither \n" "{ \n" "float bias; \n"); params = PL_DEF(params, &pl_dither_default_params); if (params->lut_size < 0 || params->lut_size > 8) { SH_FAIL(sh, "Invalid `lut_size` specified: %d", params->lut_size); return; } enum pl_dither_method method = params->method; ident_t lut = NULL_IDENT; int lut_size = 0; if (dither_method_is_lut(method)) { if (!dither_state) { PL_WARN(sh, "LUT-based dither method specified but no dither state " "object given, falling back to non-LUT based methods."); goto fallback; } struct sh_dither_obj *obj; obj = SH_OBJ(sh, dither_state, PL_SHADER_OBJ_DITHER, struct sh_dither_obj, sh_dither_uninit); if (!obj) goto fallback; bool cache = method == PL_DITHER_BLUE_NOISE; lut_size = 1 << PL_DEF(params->lut_size, pl_dither_default_params.lut_size); lut = sh_lut(sh, sh_lut_params( .object = &obj->lut, .var_type = PL_VAR_FLOAT, .width = lut_size, .height = lut_size, .comps = 1, .fill = fill_dither_matrix, .signature = (CACHE_KEY_DITHER ^ method) * lut_size, .cache = cache ? SH_CACHE(sh) : NULL, .priv = (void *) params, )); if (!lut) goto fallback; } goto done; fallback: method = PL_DITHER_ORDERED_FIXED; // fall through done: ; int size = 0; if (lut) { size = lut_size; } else if (method == PL_DITHER_ORDERED_FIXED) { size = 16; // hard-coded size } if (size) { // Transform the screen position to the cyclic range [0,1) GLSL("vec2 pos = fract(gl_FragCoord.xy * 1.0/"$"); \n", SH_FLOAT(size)); if (params->temporal) { int phase = SH_PARAMS(sh).index % 8; float r = phase * (M_PI / 2); // rotate float m = phase < 4 ? 1 : -1; // mirror float mat[2][2] = { {cos(r), -sin(r) }, {sin(r) * m, cos(r) * m}, }; ident_t rot = sh_var(sh, (struct pl_shader_var) { .var = pl_var_mat2("dither_rot"), .data = &mat[0][0], .dynamic = true, }); GLSL("pos = fract("$" * pos + vec2(1.0));\n", rot); } } switch (method) { case PL_DITHER_WHITE_NOISE: { ident_t prng = sh_prng(sh, params->temporal, NULL); GLSL("bias = "$".x;\n", prng); break; } case PL_DITHER_ORDERED_FIXED: // Bitwise ordered dither using only 32-bit uints GLSL("uvec2 xy = uvec2(pos * 16.0) %% 16u; \n" // Bitwise merge (morton number) "xy.x = xy.x ^ xy.y; \n" "xy = (xy | xy << 2) & uvec2(0x33333333); \n" "xy = (xy | xy << 1) & uvec2(0x55555555); \n" // Bitwise inversion "uint b = xy.x + (xy.y << 1); \n" "b = (b * 0x0802u & 0x22110u) | \n" " (b * 0x8020u & 0x88440u); \n" "b = 0x10101u * b; \n" "b = (b >> 16) & 0xFFu; \n" // Generate bias value "bias = float(b) * 1.0/256.0; \n"); break; case PL_DITHER_BLUE_NOISE: case PL_DITHER_ORDERED_LUT: pl_assert(lut); GLSL("bias = "$"(ivec2(pos * "$"));\n", lut, SH_FLOAT(lut_size)); break; case PL_DITHER_METHOD_COUNT: pl_unreachable(); } // Scale factor for dither rounding GLSL("const float scale = %llu.0; \n", (1LLU << new_depth) - 1); const float gamma = approx_gamma(params->transfer); if (gamma != 1.0f && new_depth <= 4) { GLSL("const float gamma = "$"; \n" "vec4 color_lin = pow(color, vec4(gamma)); \n", SH_FLOAT(gamma)); if (new_depth == 1) { // Special case for bit depth 1 dithering, in this case we can just // ignore the low/high rounding because we know we are always // dithering between 0.0 and 1.0. GLSL("const vec4 low = vec4(0.0); \n" "const vec4 high = vec4(1.0); \n" "vec4 offset = color_lin; \n"); } else { // Linearize the low, high and current color values GLSL("vec4 low = floor(color * scale) / scale; \n" "vec4 high = ceil(color * scale) / scale; \n" "vec4 low_lin = pow(low, vec4(gamma)); \n" "vec4 high_lin = pow(high, vec4(gamma)); \n" "vec4 range = high_lin - low_lin; \n" "vec4 offset = (color_lin - low_lin) / \n" " max(range, 1e-6); \n"); } // Mix in the correct ratio corresponding to the offset and bias GLSL("color = mix(low, high, greaterThan(offset, vec4(bias))); \n"); } else { // Approximate each gamma segment as a straight line, this simplifies // the process of dithering down to a single scale and (biased) round. GLSL("color = scale * color + vec4(bias); \n" "color = floor(color) * (1.0 / scale); \n"); } GLSL("} \n"); } /* Error diffusion code is taken from mpv, original copyright (c) 2019 Bin Jin * * mpv is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * mpv is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with mpv. If not, see . */ // After a (y, x) -> (y, x + y * shift) mapping, find the right most column that // will be affected by the current column. static int compute_rightmost_shifted_column(const struct pl_error_diffusion_kernel *k) { int ret = 0; for (int y = 0; y <= PL_EDF_MAX_DY; y++) { for (int x = PL_EDF_MIN_DX; x <= PL_EDF_MAX_DX; x++) { if (k->pattern[y][x - PL_EDF_MIN_DX] != 0) { int shifted_x = x + y * k->shift; // The shift mapping guarantees current column (or left of it) // won't be affected by error diffusion. assert(shifted_x > 0); ret = PL_MAX(ret, shifted_x); } } } return ret; } size_t pl_error_diffusion_shmem_req(const struct pl_error_diffusion_kernel *kernel, int height) { // We add PL_EDF_MAX_DY empty lines on the bottom to handle errors // propagated out from bottom side. int rows = height + PL_EDF_MAX_DY; int shifted_columns = compute_rightmost_shifted_column(kernel) + 1; // The shared memory is an array of size rows*shifted_columns. Each element // is a single uint for three RGB component. return rows * shifted_columns * sizeof(uint32_t); } bool pl_shader_error_diffusion(pl_shader sh, const struct pl_error_diffusion_params *params) { const int width = params->input_tex->params.w, height = params->input_tex->params.h; const struct pl_glsl_version glsl = sh_glsl(sh); const struct pl_error_diffusion_kernel *kernel = PL_DEF(params->kernel, &pl_error_diffusion_sierra_lite); pl_assert(params->output_tex->params.w == width); pl_assert(params->output_tex->params.h == height); if (!sh_require(sh, PL_SHADER_SIG_NONE, width, height)) return false; if (params->new_depth <= 0 || params->new_depth > 256) { PL_WARN(sh, "Invalid dither depth: %d.. ignoring", params->new_depth); return false; } // The parallel error diffusion works by applying the shift mapping first. // Taking the Floyd and Steinberg algorithm for example. After applying // the (y, x) -> (y, x + y * shift) mapping (with shift=2), all errors are // propagated into the next few columns, which makes parallel processing on // the same column possible. // // X 7/16 X 7/16 // 3/16 5/16 1/16 ==> 0 0 3/16 5/16 1/16 // Figuring out the size of rectangle containing all shifted pixels. // The rectangle height is not changed. int shifted_width = width + (height - 1) * kernel->shift; // We process all pixels from the shifted rectangles column by column, with // a single global work group of size |block_size|. // Figuring out how many block are required to process all pixels. We need // this explicitly to make the number of barrier() calls match. int block_size = PL_MIN(glsl.max_group_threads, height); int blocks = PL_DIV_UP(height * shifted_width, block_size); // If we figure out how many of the next columns will be affected while the // current columns is being processed. We can store errors of only a few // columns in the shared memory. Using a ring buffer will further save the // cost while iterating to next column. // int ring_buffer_rows = height + PL_EDF_MAX_DY; int ring_buffer_columns = compute_rightmost_shifted_column(kernel) + 1; ident_t ring_buffer_size = sh_const(sh, (struct pl_shader_const) { .type = PL_VAR_UINT, .name = "ring_buffer_size", .data = &(unsigned) { ring_buffer_rows * ring_buffer_columns }, .compile_time = true, }); // Compute shared memory requirements and try enabling compute shader. size_t shmem_req = ring_buffer_rows * ring_buffer_columns * sizeof(uint32_t); if (!sh_try_compute(sh, block_size, 1, false, shmem_req)) { PL_ERR(sh, "Cannot execute error diffusion kernel: too old GPU or " "insufficient compute shader memory!"); return false; } ident_t in_tex = sh_desc(sh, (struct pl_shader_desc) { .binding.object = params->input_tex, .desc = { .name = "input_tex", .type = PL_DESC_SAMPLED_TEX, }, }); ident_t out_img = sh_desc(sh, (struct pl_shader_desc) { .binding.object = params->output_tex, .desc = { .name = "output_tex", .type = PL_DESC_STORAGE_IMG, .access = PL_DESC_ACCESS_WRITEONLY, }, }); sh->output = PL_SHADER_SIG_NONE; sh_describef(sh, "error diffusion (%s, %d bits)", kernel->name, params->new_depth); // Defines the ring buffer in shared memory. GLSLH("shared uint err_rgb8["$"]; \n", ring_buffer_size); GLSL("// pl_shader_error_diffusion \n" // Safeguard against accidental over-execution "if (gl_WorkGroupID != uvec3(0)) \n" " return; \n" // Initialize the ring buffer. "for (uint i = gl_LocalInvocationIndex; i < "$"; i+=gl_WorkGroupSize.x)\n" " err_rgb8[i] = 0u; \n" // Main block loop, add barrier here to have previous block all // processed before starting the processing of the next. "for (uint block_id = 0; block_id < "$"; block_id++) { \n" "barrier(); \n" // Compute the coordinate of the pixel we are currently processing, // both before and after the shift mapping. "uint id = block_id * gl_WorkGroupSize.x + gl_LocalInvocationIndex; \n" "const uint height = "$"; \n" "int y = int(id %% height), x_shifted = int(id / height); \n" "int x = x_shifted - y * %d; \n" // Proceed only if we are processing a valid pixel. "if (x >= 0 && x < "$") { \n" // The index that the current pixel have on the ring buffer. "uint idx = uint(x_shifted * "$" + y) %% "$"; \n" // Fetch the current pixel. "vec4 pix_orig = texelFetch("$", ivec2(x, y), 0); \n" "vec3 pix = pix_orig.rgb; \n", ring_buffer_size, SH_UINT(blocks), SH_UINT(height), kernel->shift, SH_INT(width), SH_INT(ring_buffer_rows), ring_buffer_size, in_tex); // The dithering will quantize pixel value into multiples of 1/dither_quant. int dither_quant = (1 << params->new_depth) - 1; // We encode errors in RGB components into a single 32-bit unsigned integer. // The error we propagate from the current pixel is in range of // [-0.5 / dither_quant, 0.5 / dither_quant]. While not quite obvious, the // sum of all errors been propagated into a pixel is also in the same range. // It's possible to map errors in this range into [-127, 127], and use an // unsigned 8-bit integer to store it (using standard two's complement). // The three 8-bit unsigned integers can then be encoded into a single // 32-bit unsigned integer, with two 4-bit padding to prevent addition // operation overflows affecting other component. There are at most 12 // addition operations on each pixel, so 4-bit padding should be enough. // The overflow from R component will be discarded. // // The following figure is how the encoding looks like. // // +------------------------------------+ // |RRRRRRRR|0000|GGGGGGGG|0000|BBBBBBBB| // +------------------------------------+ // // The bitshift position for R and G component. const int bitshift_r = 24, bitshift_g = 12; // The multiplier we use to map [-0.5, 0.5] to [-127, 127]. const int uint8_mul = 127 * 2; GLSL(// Add the error previously propagated into current pixel, and clear // it in the ring buffer. "uint err_u32 = err_rgb8[idx] + %uu; \n" "pix = pix * %d.0 + vec3(int((err_u32 >> %d) & 0xFFu) - 128, \n" " int((err_u32 >> %d) & 0xFFu) - 128, \n" " int( err_u32 & 0xFFu) - 128) / %d.0; \n" "err_rgb8[idx] = 0u; \n" // Write the dithered pixel. "vec3 dithered = round(pix); \n" "imageStore("$", ivec2(x, y), vec4(dithered / %d.0, pix_orig.a)); \n" // Prepare for error propagation pass "vec3 err_divided = (pix - dithered) * %d.0 / %d.0; \n" "ivec3 tmp; \n", (128u << bitshift_r) | (128u << bitshift_g) | 128u, dither_quant, bitshift_r, bitshift_g, uint8_mul, out_img, dither_quant, uint8_mul, kernel->divisor); // Group error propagation with same weight factor together, in order to // reduce the number of annoying error encoding. for (int dividend = 1; dividend <= kernel->divisor; dividend++) { bool err_assigned = false; for (int y = 0; y <= PL_EDF_MAX_DY; y++) { for (int x = PL_EDF_MIN_DX; x <= PL_EDF_MAX_DX; x++) { if (kernel->pattern[y][x - PL_EDF_MIN_DX] != dividend) continue; if (!err_assigned) { err_assigned = true; GLSL("tmp = ivec3(round(err_divided * %d.0)); \n" "err_u32 = (uint(tmp.r & 0xFF) << %d) | \n" " (uint(tmp.g & 0xFF) << %d) | \n" " uint(tmp.b & 0xFF); \n", dividend, bitshift_r, bitshift_g); } int shifted_x = x + y * kernel->shift; // Unlike the right border, errors propagated out from left // border will remain in the ring buffer. This will produce // visible artifacts near the left border, especially for // shift=3 kernels. if (x < 0) GLSL("if (x >= %d) \n", -x); // Calculate the new position in the ring buffer to propagate // the error into. int ring_buffer_delta = shifted_x * ring_buffer_rows + y; GLSL("atomicAdd(err_rgb8[(idx + %du) %% "$"], err_u32); \n", ring_buffer_delta, ring_buffer_size); } } } GLSL("}} \n"); // end of main loop + valid pixel conditional return true; }