Diffstat (limited to 'src/shaders/sampling.c')
 src/shaders/sampling.c | 1198 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 1198 insertions, 0 deletions
diff --git a/src/shaders/sampling.c b/src/shaders/sampling.c
new file mode 100644
index 0000000..fc10f80
--- /dev/null
+++ b/src/shaders/sampling.c
@@ -0,0 +1,1198 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <math.h>
+#include "shaders.h"
+
+#include <libplacebo/colorspace.h>
+#include <libplacebo/shaders/sampling.h>
+
+const struct pl_deband_params pl_deband_default_params = { PL_DEBAND_DEFAULTS };
+
+static inline struct pl_tex_params src_params(const struct pl_sample_src *src)
+{
+ if (src->tex)
+ return src->tex->params;
+
+ return (struct pl_tex_params) {
+ .w = src->tex_w,
+ .h = src->tex_h,
+ };
+}
+
+enum filter {
+ NEAREST = PL_TEX_SAMPLE_NEAREST,
+ LINEAR = PL_TEX_SAMPLE_LINEAR,
+ BEST, // linear if the format supports it, nearest otherwise
+ FASTEST, // nearest, wherever the choice is ours
+};
+
+// Helper function to compute the src/dst sizes and upscaling ratios
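+// All output pointers except `src_tex` and `pos` may be NULL to skip the
+// corresponding result. `pos` receives the identifier of the sample position,
+// and `pt` the size of one source texel in normalized coordinates.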
+static bool setup_src(pl_shader sh, const struct pl_sample_src *src,
+ ident_t *src_tex, ident_t *pos, ident_t *pt,
+ float *ratio_x, float *ratio_y, uint8_t *comp_mask,
+ float *scale, bool resizeable,
+ enum filter filter)
+{
+ enum pl_shader_sig sig;
+ float src_w, src_h;
+ enum pl_tex_sample_mode sample_mode;
+ if (src->tex) {
+ pl_fmt fmt = src->tex->params.format;
+ bool can_linear = fmt->caps & PL_FMT_CAP_LINEAR;
+ pl_assert(pl_tex_params_dimension(src->tex->params) == 2);
+ sig = PL_SHADER_SIG_NONE;
+ src_w = pl_rect_w(src->rect);
+ src_h = pl_rect_h(src->rect);
+ switch (filter) {
+ case FASTEST:
+ case NEAREST:
+ sample_mode = PL_TEX_SAMPLE_NEAREST;
+ break;
+ case LINEAR:
+ if (!can_linear) {
+ SH_FAIL(sh, "Trying to use a shader that requires linear "
+ "sampling with a texture whose format (%s) does not "
+ "support PL_FMT_CAP_LINEAR", fmt->name);
+ return false;
+ }
+ sample_mode = PL_TEX_SAMPLE_LINEAR;
+ break;
+ case BEST:
+ sample_mode = can_linear ? PL_TEX_SAMPLE_LINEAR : PL_TEX_SAMPLE_NEAREST;
+ break;
+ }
+ } else {
+ pl_assert(src->tex_w && src->tex_h);
+ sig = PL_SHADER_SIG_SAMPLER;
+ src_w = src->sampled_w;
+ src_h = src->sampled_h;
+ if (filter == BEST || filter == FASTEST) {
+ sample_mode = src->mode;
+ } else {
+ sample_mode = (enum pl_tex_sample_mode) filter;
+ if (sample_mode != src->mode) {
+ SH_FAIL(sh, "Trying to use a shader that requires a different "
+ "filter mode than the external sampler.");
+ return false;
+ }
+ }
+ }
+
+ src_w = PL_DEF(src_w, src_params(src).w);
+ src_h = PL_DEF(src_h, src_params(src).h);
+ pl_assert(src_w && src_h);
+
+ int out_w = PL_DEF(src->new_w, roundf(fabs(src_w)));
+ int out_h = PL_DEF(src->new_h, roundf(fabs(src_h)));
+ pl_assert(out_w && out_h);
+
+ if (ratio_x)
+ *ratio_x = out_w / fabs(src_w);
+ if (ratio_y)
+ *ratio_y = out_h / fabs(src_h);
+ if (scale)
+ *scale = PL_DEF(src->scale, 1.0);
+
+ if (comp_mask) {
+ uint8_t tex_mask = 0x0Fu;
+ if (src->tex) {
+ // Mask containing only the number of components in the texture
+ tex_mask = (1 << src->tex->params.format->num_components) - 1;
+ }
+
+ uint8_t src_mask = src->component_mask;
+ if (!src_mask)
+ src_mask = (1 << PL_DEF(src->components, 4)) - 1;
+
+ // Only actually sample components that are both requested and
+ // available in the texture being sampled
+ *comp_mask = tex_mask & src_mask;
+ }
+
+ if (resizeable)
+ out_w = out_h = 0;
+ if (!sh_require(sh, sig, out_w, out_h))
+ return false;
+
+ if (src->tex) {
+ pl_rect2df rect = {
+ .x0 = src->rect.x0,
+ .y0 = src->rect.y0,
+ .x1 = src->rect.x0 + src_w,
+ .y1 = src->rect.y0 + src_h,
+ };
+
+ *src_tex = sh_bind(sh, src->tex, src->address_mode, sample_mode,
+ "src_tex", &rect, pos, pt);
+ } else {
+ if (pt) {
+ float sx = 1.0 / src->tex_w, sy = 1.0 / src->tex_h;
+ if (src->sampler == PL_SAMPLER_RECT)
+ sx = sy = 1.0;
+
+ *pt = sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_vec2("tex_pt"),
+ .data = &(float[2]) { sx, sy },
+ });
+ }
+
+ sh->sampler_type = src->sampler;
+
+ pl_assert(src->format);
+ switch (src->format) {
+ case PL_FMT_UNKNOWN:
+ case PL_FMT_FLOAT:
+ case PL_FMT_UNORM:
+ case PL_FMT_SNORM: sh->sampler_prefix = ' '; break;
+ case PL_FMT_UINT: sh->sampler_prefix = 'u'; break;
+ case PL_FMT_SINT: sh->sampler_prefix = 's'; break;
+ case PL_FMT_TYPE_COUNT:
+ pl_unreachable();
+ }
+
+ *src_tex = sh_fresh(sh, "src_tex");
+ *pos = sh_fresh(sh, "pos");
+
+ GLSLH("#define "$" src_tex \n"
+ "#define "$" pos \n",
+ *src_tex, *pos);
+ }
+
+ return true;
+}
+
+void pl_shader_deband(pl_shader sh, const struct pl_sample_src *src,
+ const struct pl_deband_params *params)
+{
+ float scale;
+ ident_t tex, pos, pt;
+ uint8_t mask;
+ if (!setup_src(sh, src, &tex, &pos, &pt, NULL, NULL, &mask, &scale, false, LINEAR))
+ return;
+
+ params = PL_DEF(params, &pl_deband_default_params);
+ sh_describe(sh, "debanding");
+ GLSL("vec4 color; \n"
+ "// pl_shader_deband \n"
+ "{ \n"
+ "vec2 pos = "$", pt = "$"; \n"
+ "color = textureLod("$", pos, 0.0);\n",
+ pos, pt, tex);
+
+ mask &= ~0x8u; // ignore alpha channel
+ uint8_t num_comps = sh_num_comps(mask);
+ const char *swiz = sh_swizzle(mask);
+ pl_assert(num_comps <= 3);
+ if (!num_comps) {
+ GLSL("color *= "$"; \n"
+ "} \n",
+ SH_FLOAT(scale));
+ return;
+ }
+
+ GLSL("#define GET(X, Y) \\\n"
+ " (textureLod("$", pos + pt * vec2(X, Y), 0.0).%s) \n"
+ "#define T %s \n",
+ tex, swiz, sh_float_type(mask));
+
+ ident_t prng = sh_prng(sh, true, NULL);
+ GLSL("T avg, diff, bound; \n"
+ "T res = color.%s; \n"
+ "vec2 d; \n",
+ swiz);
+
+ if (params->iterations > 0) {
+ ident_t radius = sh_const_float(sh, "radius", params->radius);
+ ident_t threshold = sh_const_float(sh, "threshold",
+ params->threshold / (1000 * scale));
+
+ // For each iteration, compute the average at a given distance and
+ // pick it instead of the color if the difference is below the threshold.
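+ // ($prng yields uniform samples in [0,1), so d.x below is a random
+ // distance in [0, i * radius] texels, and d.y a random angle in [0, 2pi))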
+ for (int i = 1; i <= params->iterations; i++) {
+ GLSL(// Compute a random angle and distance
+ "d = "$".xy * vec2(%d.0 * "$", %f); \n"
+ "d = d.x * vec2(cos(d.y), sin(d.y)); \n"
+ // Sample at quarter-turn intervals around the source pixel
+ "avg = T(0.0); \n"
+ "avg += GET(+d.x, +d.y); \n"
+ "avg += GET(-d.x, +d.y); \n"
+ "avg += GET(-d.x, -d.y); \n"
+ "avg += GET(+d.x, -d.y); \n"
+ "avg *= 0.25; \n"
+ // Compare the (normalized) average against the pixel
+ "diff = abs(res - avg); \n"
+ "bound = T("$" / %d.0); \n",
+ prng, i, radius, M_PI * 2,
+ threshold, i);
+
+ if (num_comps > 1) {
+ GLSL("res = mix(avg, res, greaterThan(diff, bound)); \n");
+ } else {
+ GLSL("res = mix(avg, res, diff > bound); \n");
+ }
+ }
+ }
+
+ // Add some random noise to smooth out residual differences
+ if (params->grain > 0) {
+ // Avoid adding grain near true black
+ GLSL("bound = T(\n");
+ for (int c = 0; c < num_comps; c++) {
+ GLSL("%c"$, c > 0 ? ',' : ' ',
+ SH_FLOAT(params->grain_neutral[c] / scale));
+ }
+ GLSL("); \n"
+ "T strength = min(abs(res - bound), "$"); \n"
+ "res += strength * (T("$") - T(0.5)); \n",
+ SH_FLOAT(params->grain / (1000.0 * scale)), prng);
+ }
+
+ GLSL("color.%s = res; \n"
+ "color *= "$"; \n"
+ "#undef T \n"
+ "#undef GET \n"
+ "} \n",
+ swiz, SH_FLOAT(scale));
+}
+
+bool pl_shader_sample_direct(pl_shader sh, const struct pl_sample_src *src)
+{
+ float scale;
+ ident_t tex, pos;
+ if (!setup_src(sh, src, &tex, &pos, NULL, NULL, NULL, NULL, &scale, true, BEST))
+ return false;
+
+ GLSL("// pl_shader_sample_direct \n"
+ "vec4 color = vec4("$") * textureLod("$", "$", 0.0); \n",
+ SH_FLOAT(scale), tex, pos);
+ return true;
+}
+
+bool pl_shader_sample_nearest(pl_shader sh, const struct pl_sample_src *src)
+{
+ float scale;
+ ident_t tex, pos;
+ if (!setup_src(sh, src, &tex, &pos, NULL, NULL, NULL, NULL, &scale, true, NEAREST))
+ return false;
+
+ sh_describe(sh, "nearest");
+ GLSL("// pl_shader_sample_nearest \n"
+ "vec4 color = vec4("$") * textureLod("$", "$", 0.0); \n",
+ SH_FLOAT(scale), tex, pos);
+ return true;
+}
+
+bool pl_shader_sample_bilinear(pl_shader sh, const struct pl_sample_src *src)
+{
+ float scale;
+ ident_t tex, pos;
+ if (!setup_src(sh, src, &tex, &pos, NULL, NULL, NULL, NULL, &scale, true, LINEAR))
+ return false;
+
+ sh_describe(sh, "bilinear");
+ GLSL("// pl_shader_sample_bilinear \n"
+ "vec4 color = vec4("$") * textureLod("$", "$", 0.0); \n",
+ SH_FLOAT(scale), tex, pos);
+ return true;
+}
+
+bool pl_shader_sample_bicubic(pl_shader sh, const struct pl_sample_src *src)
+{
+ ident_t tex, pos, pt;
+ float rx, ry, scale;
+ if (!setup_src(sh, src, &tex, &pos, &pt, &rx, &ry, NULL, &scale, true, LINEAR))
+ return false;
+
+ if (rx < 1 || ry < 1) {
+ PL_TRACE(sh, "Using fast bicubic sampling when downscaling. This "
+ "will most likely result in nasty aliasing!");
+ }
+
+ // Explanation of how bicubic scaling with only 4 texel fetches is done:
+ // http://www.mate.tue.nl/mate/pdfs/10318.pdf
+ // 'Efficient GPU-Based Texture Interpolation using Uniform B-Splines'
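+// In terms of the fractional coordinate f = frac below, the cubic B-spline
+// weights being computed are (per axis):
+//   w0 = (1-f)^3 / 6
+//   w1 = (4 - 6f^2 + 3f^3) / 6
+//   w2 = (1 + 3f + 3f^2 - 3f^3) / 6
+//   w3 = f^3 / 6
+// Adjacent weight pairs are then folded into a single bilinear fetch each,
+// with offset h and combined weight g, for 4 fetches instead of 16.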
+
+ sh_describe(sh, "bicubic");
+#pragma GLSL /* pl_shader_sample_bicubic */ \
+ vec4 color; \
+ { \
+ vec2 pos = $pos; \
+ vec2 size = vec2(textureSize($tex, 0)); \
+ vec2 frac = fract(pos * size + vec2(0.5)); \
+ vec2 frac2 = frac * frac; \
+ vec2 inv = vec2(1.0) - frac; \
+ vec2 inv2 = inv * inv; \
+ /* compute filter weights directly */ \
+ vec2 w0 = 1.0/6.0 * inv2 * inv; \
+ vec2 w1 = 2.0/3.0 - 0.5 * frac2 * (2.0 - frac); \
+ vec2 w2 = 2.0/3.0 - 0.5 * inv2 * (2.0 - inv); \
+ vec2 w3 = 1.0/6.0 * frac2 * frac; \
+ vec4 g = vec4(w0 + w1, w2 + w3); \
+ vec4 h = vec4(w1, w3) / g + inv.xyxy; \
+ h.xy -= vec2(2.0); \
+ /* sample four corners, then interpolate */ \
+ vec4 p = pos.xyxy + $pt.xyxy * h; \
+ vec4 c00 = textureLod($tex, p.xy, 0.0); \
+ vec4 c01 = textureLod($tex, p.xw, 0.0); \
+ vec4 c0 = mix(c01, c00, g.y); \
+ vec4 c10 = textureLod($tex, p.zy, 0.0); \
+ vec4 c11 = textureLod($tex, p.zw, 0.0); \
+ vec4 c1 = mix(c11, c10, g.y); \
+ color = ${float:scale} * mix(c1, c0, g.x); \
+ }
+
+ return true;
+}
+
+bool pl_shader_sample_hermite(pl_shader sh, const struct pl_sample_src *src)
+{
+ ident_t tex, pos, pt;
+ float rx, ry, scale;
+ if (!setup_src(sh, src, &tex, &pos, &pt, &rx, &ry, NULL, &scale, true, LINEAR))
+ return false;
+
+ if (rx < 1 || ry < 1) {
+ PL_TRACE(sh, "Using fast hermite sampling when downscaling. This "
+ "will most likely result in nasty aliasing!");
+ }
+
+ sh_describe(sh, "hermite");
+#pragma GLSL /* pl_shader_sample_hermite */ \
+ vec4 color; \
+ { \
+ vec2 pos = $pos; \
+ vec2 size = vec2(textureSize($tex, 0)); \
+ vec2 frac = fract(pos * size + vec2(0.5)); \
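+ /* this nudge makes the hardware bilerp evaluate a cubic hermite curve */ \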
+ pos += $pt * (smoothstep(0.0, 1.0, frac) - frac); \
+ color = ${float:scale} * textureLod($tex, pos, 0.0); \
+ }
+
+ return true;
+}
+
+bool pl_shader_sample_gaussian(pl_shader sh, const struct pl_sample_src *src)
+{
+ ident_t tex, pos, pt;
+ float rx, ry, scale;
+ if (!setup_src(sh, src, &tex, &pos, &pt, &rx, &ry, NULL, &scale, true, LINEAR))
+ return false;
+
+ if (rx < 1 || ry < 1) {
+ PL_TRACE(sh, "Using fast gaussian sampling when downscaling. This "
+ "will most likely result in nasty aliasing!");
+ }
+
+ sh_describe(sh, "gaussian");
+#pragma GLSL /* pl_shader_sample_gaussian */ \
+ vec4 color; \
+ { \
+ vec2 pos = $pos; \
+ vec2 size = vec2(textureSize($tex, 0)); \
+ vec2 off = -fract(pos * size + vec2(0.5)); \
+ vec2 off2 = -2.0 * off * off; \
+ /* gaussian weights: w_i = exp(-2 d_i^2), a sigma = 0.5 gaussian at the four nearest texels */ \
+ vec2 w0 = exp(off2 + 4.0 * off - vec2(2.0)); \
+ vec2 w1 = exp(off2); \
+ vec2 w2 = exp(off2 - 4.0 * off - vec2(2.0)); \
+ vec2 w3 = exp(off2 - 8.0 * off - vec2(8.0)); \
+ vec4 g = vec4(w0 + w1, w2 + w3); \
+ vec4 h = vec4(w1, w3) / g; \
+ h.xy -= vec2(1.0); \
+ h.zw += vec2(1.0); \
+ g.xy /= g.xy + g.zw; /* explicitly normalize */ \
+ /* sample four corners, then interpolate */ \
+ vec4 p = pos.xyxy + $pt.xyxy * (h + off.xyxy); \
+ vec4 c00 = textureLod($tex, p.xy, 0.0); \
+ vec4 c01 = textureLod($tex, p.xw, 0.0); \
+ vec4 c0 = mix(c01, c00, g.y); \
+ vec4 c10 = textureLod($tex, p.zy, 0.0); \
+ vec4 c11 = textureLod($tex, p.zw, 0.0); \
+ vec4 c1 = mix(c11, c10, g.y); \
+ color = ${float:scale} * mix(c1, c0, g.x); \
+ }
+
+ return true;
+}
+
+bool pl_shader_sample_oversample(pl_shader sh, const struct pl_sample_src *src,
+ float threshold)
+{
+ ident_t tex, pos, pt;
+ float rx, ry, scale;
+ if (!setup_src(sh, src, &tex, &pos, &pt, &rx, &ry, NULL, &scale, true, LINEAR))
+ return false;
+
+ threshold = PL_CLAMP(threshold, 0.0f, 0.5f);
+ sh_describe(sh, "oversample");
+ #pragma GLSL /* pl_shader_sample_oversample */ \
+ vec4 color; \
+ { \
+ vec2 pos = $pos; \
+ vec2 size = vec2(textureSize($tex, 0)); \
+ /* Round the position to the nearest pixel */ \
+ vec2 fcoord = fract(pos * size - vec2(0.5)); \
+ float rx = ${dynamic float:rx}; \
+ float ry = ${dynamic float:ry}; \
+ vec2 coeff = (fcoord - vec2(0.5)) * vec2(rx, ry); \
+ coeff = clamp(coeff + vec2(0.5), 0.0, 1.0); \
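+ /* coeff is 0 or 1 inside texels (nearest), blending linearly at edges */ \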
+ @if (threshold > 0) { \
+ float thresh = ${float:threshold}; \
+ coeff = mix(coeff, vec2(0.0), \
+ lessThan(coeff, vec2(thresh))); \
+ coeff = mix(coeff, vec2(1.0), \
+ greaterThan(coeff, vec2(1.0 - thresh))); \
+ @} \
+ \
+ /* Compute the right output blend of colors */ \
+ pos += (coeff - fcoord) * $pt; \
+ color = ${float:scale} * textureLod($tex, pos, 0.0); \
+ }
+
+ return true;
+}
+
+static void describe_filter(pl_shader sh, const struct pl_filter_config *cfg,
+ const char *stage, float rx, float ry)
+{
+ const char *dir;
+ if (rx > 1 && ry > 1) {
+ dir = "up";
+ } else if (rx < 1 && ry < 1) {
+ dir = "down";
+ } else if (rx == 1 && ry == 1) {
+ dir = "noop";
+ } else {
+ dir = "ana";
+ }
+
+ if (cfg->name) {
+ sh_describef(sh, "%s %sscaling (%s)", stage, dir, cfg->name);
+ } else if (cfg->window) {
+ sh_describef(sh, "%s %sscaling (%s+%s)", stage, dir,
+ PL_DEF(cfg->kernel->name, "unknown"),
+ PL_DEF(cfg->window->name, "unknown"));
+ } else {
+ sh_describef(sh, "%s %sscaling (%s)", stage, dir,
+ PL_DEF(cfg->kernel->name, "unknown"));
+ }
+}
+
+// Subroutine for computing and adding an individual texel contribution
+// If `in` is NULL, samples directly from the texture
+// If `in` is set, takes the pixel from ${in}_X[idx], where X is the
+// component index; `idx` must be defined by the caller
+static void polar_sample(pl_shader sh, pl_filter filter,
+ ident_t tex, ident_t lut, ident_t radius,
+ int x, int y, uint8_t comp_mask, ident_t in,
+ bool use_ar, ident_t scale)
+{
+ // Since we can't know the subpixel position in advance, assume a
+ // worst case scenario
+ int yy = y > 0 ? y-1 : y;
+ int xx = x > 0 ? x-1 : x;
+ float dmin = sqrt(xx*xx + yy*yy);
+ // Skip samples definitely outside the radius
+ if (dmin >= filter->radius)
+ return;
+
+ // Check for samples that might be skippable
+ bool maybe_skippable = dmin >= filter->radius - M_SQRT2;
+
+ // Check for samples that definitely won't contribute to anti-ringing
+ const float ar_radius = filter->radius_zero;
+ use_ar &= dmin < ar_radius;
+
+#pragma GLSL \
+ offset = ivec2(${const int: x}, ${const int: y}); \
+ d = length(vec2(offset) - fcoord); \
+ @if (maybe_skippable) \
+ if (d < $radius) { \
+ w = $lut(d * 1.0 / $radius); \
+ wsum += w; \
+ @if (in != NULL_IDENT) { \
+ @for (c : comp_mask) \
+ c[@c] = ${in}_@c[idx]; \
+ @} else { \
+ c = textureLod($tex, base + pt * vec2(offset), 0.0); \
+ @} \
+ @for (c : comp_mask) \
+ color[@c] += w * c[@c]; \
+ @if (use_ar) { \
+ if (d <= ${const float: ar_radius}) { \
+ @for (c : comp_mask) { \
+ cc = vec2($scale * c[@c]); \
+ cc.x = 1.0 - cc.x; \
+ ww = cc + vec2(0.10); \
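+ /* raising to the 32nd power gives a soft min (x) / max (y) */ \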
+ ww = ww * ww; \
+ ww = ww * ww; \
+ ww = ww * ww; \
+ ww = ww * ww; \
+ ww = ww * ww; \
+ ww = w * ww; \
+ ar@c += ww * cc; \
+ wwsum@c += ww; \
+ @} \
+ } \
+ @} \
+ @if (maybe_skippable) \
+ }
+}
+
+struct sh_sampler_obj {
+ pl_filter filter;
+ pl_shader_obj lut;
+ pl_shader_obj pass2; // for pl_shader_sample_ortho
+};
+
+#define SCALER_LUT_SIZE 256
+#define SCALER_LUT_CUTOFF 1e-3f
+
+static void sh_sampler_uninit(pl_gpu gpu, void *ptr)
+{
+ struct sh_sampler_obj *obj = ptr;
+ pl_shader_obj_destroy(&obj->lut);
+ pl_shader_obj_destroy(&obj->pass2);
+ pl_filter_free(&obj->filter);
+ *obj = (struct sh_sampler_obj) {0};
+}
+
+static void fill_polar_lut(void *data, const struct sh_lut_params *params)
+{
+ const struct sh_sampler_obj *obj = params->priv;
+ pl_filter filt = obj->filter;
+
+ pl_assert(params->width == filt->params.lut_entries && params->comps == 1);
+ memcpy(data, filt->weights, params->width * sizeof(float));
+}
+
+bool pl_shader_sample_polar(pl_shader sh, const struct pl_sample_src *src,
+ const struct pl_sample_filter_params *params)
+{
+ pl_assert(params);
+ if (!params->filter.polar) {
+ SH_FAIL(sh, "Trying to use polar sampling with a non-polar filter?");
+ return false;
+ }
+
+ uint8_t cmask;
+ float rx, ry, scalef;
+ ident_t src_tex, pos, pt, scale;
+ if (!setup_src(sh, src, &src_tex, &pos, &pt, &rx, &ry, &cmask, &scalef, false, FASTEST))
+ return false;
+
+ struct sh_sampler_obj *obj;
+ obj = SH_OBJ(sh, params->lut, PL_SHADER_OBJ_SAMPLER, struct sh_sampler_obj,
+ sh_sampler_uninit);
+ if (!obj)
+ return false;
+
+ float inv_scale = 1.0 / PL_MIN(rx, ry);
+ inv_scale = PL_MAX(inv_scale, 1.0);
+ if (params->no_widening)
+ inv_scale = 1.0;
+ scale = sh_const_float(sh, "scale", scalef);
+
+ struct pl_filter_config cfg = params->filter;
+ cfg.antiring = PL_DEF(cfg.antiring, params->antiring);
+ cfg.blur = PL_DEF(cfg.blur, 1.0f) * inv_scale;
+ bool update = !obj->filter || !pl_filter_config_eq(&obj->filter->params.config, &cfg);
+ if (update) {
+ pl_filter_free(&obj->filter);
+ obj->filter = pl_filter_generate(sh->log, pl_filter_params(
+ .config = cfg,
+ .lut_entries = SCALER_LUT_SIZE,
+ .cutoff = SCALER_LUT_CUTOFF,
+ ));
+
+ if (!obj->filter) {
+ // This should never happen, but just in case ...
+ SH_FAIL(sh, "Failed initializing polar filter!");
+ return false;
+ }
+ }
+
+ describe_filter(sh, &cfg, "polar", rx, ry);
+ GLSL("// pl_shader_sample_polar \n"
+ "vec4 color = vec4(0.0); \n"
+ "{ \n"
+ "vec2 pos = "$", pt = "$"; \n"
+ "vec2 size = vec2(textureSize("$", 0)); \n"
+ "vec2 fcoord = fract(pos * size - vec2(0.5)); \n"
+ "vec2 base = pos - pt * fcoord; \n"
+ "vec2 center = base + pt * vec2(0.5); \n"
+ "ivec2 offset; \n"
+ "float w, d, wsum = 0.0; \n"
+ "int idx; \n"
+ "vec4 c; \n",
+ pos, pt, src_tex);
+
+ bool use_ar = cfg.antiring > 0;
+ if (use_ar) {
+#pragma GLSL \
+ vec2 ww, cc; \
+ @for (c : cmask) \
+ vec2 ar@c = vec2(0.0), wwsum@c = vec2(0.0);
+ }
+
+ int bound = ceil(obj->filter->radius);
+ int offset = bound - 1; // padding top/left
+ int padding = offset + bound; // total padding
+
+ // Determined experimentally on modern AMD and Nvidia hardware. 32 is a
+ // good tradeoff for the horizontal work group size. Apart from that,
+ // just use as many threads as possible.
+ const int bw = 32, bh = sh_glsl(sh).max_group_threads / bw;
+
+ // We need to sample everything from base_min to base_max, so make sure we
+ // have enough room in shmem. The extra margin on the ceilf guards against
+ // floating point inaccuracy on near-integer scaling ratios.
+ const float margin = 1e-5;
+ int iw = (int) ceilf(bw / rx - margin) + padding + 1,
+ ih = (int) ceilf(bh / ry - margin) + padding + 1;
+ int sizew = iw, sizeh = ih;
+
+ pl_gpu gpu = SH_GPU(sh);
+ bool dynamic_size = SH_PARAMS(sh).dynamic_constants ||
+ !gpu || !gpu->limits.array_size_constants;
+ if (dynamic_size) {
+ // Overallocate the array slightly to reduce recompilation overhead
+ sizew = PL_ALIGN2(sizew, 8);
+ sizeh = PL_ALIGN2(sizeh, 8);
+ }
+
+ int num_comps = __builtin_popcount(cmask);
+ int shmem_req = (sizew * sizeh * num_comps + 2) * sizeof(float);
+ bool is_compute = !params->no_compute && sh_glsl(sh).compute &&
+ sh_try_compute(sh, bw, bh, false, shmem_req);
+
+ // Note: SH_LUT_LITERAL might be faster in some specific cases, but not by
+ // much, and it's catastrophically slow on other platforms.
+ ident_t lut = sh_lut(sh, sh_lut_params(
+ .object = &obj->lut,
+ .lut_type = SH_LUT_TEXTURE,
+ .var_type = PL_VAR_FLOAT,
+ .method = SH_LUT_LINEAR,
+ .width = SCALER_LUT_SIZE,
+ .comps = 1,
+ .update = update,
+ .fill = fill_polar_lut,
+ .priv = obj,
+ ));
+
+ if (!lut) {
+ SH_FAIL(sh, "Failed initializing polar LUT!");
+ return false;
+ }
+
+ ident_t radius_c = sh_const_float(sh, "radius", obj->filter->radius);
+ ident_t in = sh_fresh(sh, "in");
+
+ if (is_compute) {
+
+ // Compute shader kernel
+ GLSL("uvec2 base_id = uvec2(0u); \n");
+ if (src->rect.x0 > src->rect.x1)
+ GLSL("base_id.x = gl_WorkGroupSize.x - 1u; \n");
+ if (src->rect.y0 > src->rect.y1)
+ GLSL("base_id.y = gl_WorkGroupSize.y - 1u; \n");
+
+ GLSLH("shared vec2 "$"_base; \n", in);
+ GLSL("if (gl_LocalInvocationID.xy == base_id) \n"
+ " "$"_base = base; \n"
+ "barrier(); \n"
+ "ivec2 rel = ivec2(round((base - "$"_base) * size)); \n",
+ in, in);
+
+ ident_t sizew_c = sh_const(sh, (struct pl_shader_const) {
+ .type = PL_VAR_SINT,
+ .compile_time = true,
+ .name = "sizew",
+ .data = &sizew,
+ });
+
+ ident_t sizeh_c = sh_const(sh, (struct pl_shader_const) {
+ .type = PL_VAR_SINT,
+ .compile_time = true,
+ .name = "sizeh",
+ .data = &sizeh,
+ });
+
+ ident_t iw_c = sizew_c, ih_c = sizeh_c;
+ if (dynamic_size) {
+ iw_c = sh_const_int(sh, "iw", iw);
+ ih_c = sh_const_int(sh, "ih", ih);
+ }
+
+ // Load all relevant texels into shmem
+ GLSL("for (int y = int(gl_LocalInvocationID.y); y < "$"; y += %d) { \n"
+ "for (int x = int(gl_LocalInvocationID.x); x < "$"; x += %d) { \n"
+ "c = textureLod("$", "$"_base + pt * vec2(x - %d, y - %d), 0.0); \n",
+ ih_c, bh, iw_c, bw, src_tex, in, offset, offset);
+
+ for (uint8_t comps = cmask; comps;) {
+ uint8_t c = __builtin_ctz(comps);
+ GLSLH("shared float "$"_%d["$" * "$"]; \n", in, c, sizeh_c, sizew_c);
+ GLSL(""$"_%d["$" * y + x] = c[%d]; \n", in, c, sizew_c, c);
+ comps &= ~(1 << c);
+ }
+
+ GLSL("}} \n"
+ "barrier(); \n");
+
+ // Dispatch the actual samples
+ for (int y = 1 - bound; y <= bound; y++) {
+ for (int x = 1 - bound; x <= bound; x++) {
+ GLSL("idx = "$" * rel.y + rel.x + "$" * %d + %d; \n",
+ sizew_c, sizew_c, y + offset, x + offset);
+ polar_sample(sh, obj->filter, src_tex, lut, radius_c,
+ x, y, cmask, in, use_ar, scale);
+ }
+ }
+ } else {
+ // Fragment shader sampling
+ for (uint8_t comps = cmask; comps;) {
+ uint8_t c = __builtin_ctz(comps);
+ GLSL("vec4 "$"_%d; \n", in, c);
+ comps &= ~(1 << c);
+ }
+
+ // For maximum efficiency, we want to use textureGather() if
+ // possible, rather than direct sampling. Since this is not
+ // always possible/sensible, we need to possibly intermix gathering
+ // with regular sampling. This requires keeping track of which
+ // pixels in the next row were already gathered by the previous
+ // row.
+ uint32_t gathered_cur = 0x0, gathered_next = 0x0;
+ const float radius2 = PL_SQUARE(obj->filter->radius);
+ const int base = bound - 1;
+
+ if (base + bound >= 8 * sizeof(gathered_cur)) {
+ SH_FAIL(sh, "Polar radius %f exceeds implementation capacity!",
+ obj->filter->radius);
+ return false;
+ }
+
+ for (int y = 1 - bound; y <= bound; y++) {
+ for (int x = 1 - bound; x <= bound; x++) {
+ // Skip already gathered texels
+ uint32_t bit = 1llu << (base + x);
+ if (gathered_cur & bit)
+ continue;
+
+ // Using texture gathering is only more efficient than direct
+ // sampling in the case where we expect to be able to use all
+ // four gathered texels, without having to discard any. So
+ // only do it if we suspect it will be a win rather than a
+ // loss.
+ int xx = x*x, xx1 = (x+1)*(x+1);
+ int yy = y*y, yy1 = (y+1)*(y+1);
+ bool use_gather = PL_MAX(xx, xx1) + PL_MAX(yy, yy1) < radius2;
+ use_gather &= PL_MAX(x, y) <= sh_glsl(sh).max_gather_offset;
+ use_gather &= PL_MIN(x, y) >= sh_glsl(sh).min_gather_offset;
+ use_gather &= !src->tex || src->tex->params.format->gatherable;
+
+ // Gathering from components other than the R channel requires
+ // support for GLSL 400, which introduces the overload of
+ // textureGather* that allows specifying the component.
+ //
+ // This is also the minimum requirement if we don't know the
+ // texture format capabilities, for the sampler2D interface
+ if (cmask != 0x1 || !src->tex)
+ use_gather &= sh_glsl(sh).version >= 400;
+
+ if (!use_gather) {
+ // Switch to direct sampling instead
+ polar_sample(sh, obj->filter, src_tex, lut, radius_c,
+ x, y, cmask, NULL_IDENT, use_ar, scale);
+ continue;
+ }
+
+ // Gather the four surrounding texels simultaneously
+ for (uint8_t comps = cmask; comps;) {
+ uint8_t c = __builtin_ctz(comps);
+ if (x || y) {
+ if (c) {
+ GLSL($"_%d = textureGatherOffset("$", "
+ "center, ivec2(%d, %d), %d); \n",
+ in, c, src_tex, x, y, c);
+ } else {
+ GLSL($"_0 = textureGatherOffset("$", "
+ "center, ivec2(%d, %d)); \n",
+ in, src_tex, x, y);
+ }
+ } else {
+ if (c) {
+ GLSL($"_%d = textureGather("$", center, %d); \n",
+ in, c, src_tex, c);
+ } else {
+ GLSL($"_0 = textureGather("$", center); \n",
+ in, src_tex);
+ }
+ }
+ comps &= ~(1 << c);
+ }
+
+ // Mix in all of the points with their weights
+ for (int p = 0; p < 4; p++) {
+ // The four texels are gathered counterclockwise starting
+ // from the bottom left
+ static const int xo[4] = {0, 1, 1, 0};
+ static const int yo[4] = {1, 1, 0, 0};
+ if (x+xo[p] > bound || y+yo[p] > bound)
+ continue; // next subpixel
+
+ GLSL("idx = %d;\n", p);
+ polar_sample(sh, obj->filter, src_tex, lut, radius_c,
+ x+xo[p], y+yo[p], cmask, in, use_ar, scale);
+ }
+
+ // Mark the next row's pixels as already gathered
+ gathered_next |= bit | (bit << 1);
+ x++; // skip adjacent pixel
+ }
+
+ // Prepare for new row
+ gathered_cur = gathered_next;
+ gathered_next = 0;
+ }
+ }
+
+#pragma GLSL \
+ color = $scale / wsum * color; \
+ @if (use_ar) { \
+ @for (c : cmask) { \
+ ww = ar@c / wwsum@c; \
+ ww.x = 1.0 - ww.x; \
+ w = clamp(color[@c], ww.x, ww.y); \
+ w = mix(w, dot(ww, vec2(0.5)), ww.x > ww.y); \
+ color[@c] = mix(color[@c], w, ${float:cfg.antiring}); \
+ @} \
+ @} \
+ @if (!(cmask & (1 << PL_CHANNEL_A))) \
+ color.a = 1.0; \
+ }
+
+ return true;
+}
+
+static void fill_ortho_lut(void *data, const struct sh_lut_params *params)
+{
+ const struct sh_sampler_obj *obj = params->priv;
+ pl_filter filt = obj->filter;
+
+ if (filt->radius == filt->radius_zero) {
+ // Main lobe covers entire radius, so all weights are positive, meaning
+ // we can use the linear resampling trick
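+ // (a linear fetch at offset w1/(w0+w1), scaled by w0+w1, equals
+ // w0*tex[i] + w1*tex[i+1] when both weights are non-negative, halving
+ // the number of fetches)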
+ for (int n = 0; n < SCALER_LUT_SIZE; n++) {
+ const float *weights = filt->weights + n * filt->row_stride;
+ float *row = (float *) data + n * filt->row_stride;
+ pl_assert(filt->row_size % 2 == 0);
+ for (int i = 0; i < filt->row_size; i += 2) {
+ const float w0 = weights[i], w1 = weights[i+1];
+ pl_assert(w0 + w1 >= 0.0f);
+ row[i] = w0 + w1;
+ row[i+1] = w1 / (w0 + w1);
+ }
+ }
+ } else {
+ size_t entries = SCALER_LUT_SIZE * filt->row_stride;
+ pl_assert(params->width * params->height * params->comps == entries);
+ memcpy(data, filt->weights, entries * sizeof(float));
+ }
+}
+
+enum {
+ SEP_VERT = 0,
+ SEP_HORIZ,
+ SEP_PASSES
+};
+
+bool pl_shader_sample_ortho2(pl_shader sh, const struct pl_sample_src *src,
+ const struct pl_sample_filter_params *params)
+{
+ pl_assert(params);
+ if (params->filter.polar) {
+ SH_FAIL(sh, "Trying to use separated sampling with a polar filter?");
+ return false;
+ }
+
+ pl_gpu gpu = SH_GPU(sh);
+ pl_assert(gpu);
+
+ uint8_t comps;
+ float ratio[SEP_PASSES], scale;
+ ident_t src_tex, pos, pt;
+ if (!setup_src(sh, src, &src_tex, &pos, &pt,
+ &ratio[SEP_HORIZ], &ratio[SEP_VERT],
+ &comps, &scale, false, LINEAR))
+ return false;
+
+ int pass;
+ if (fabs(ratio[SEP_HORIZ] - 1.0f) < 1e-6f) {
+ pass = SEP_VERT;
+ } else if (fabs(ratio[SEP_VERT] - 1.0f) < 1e-6f) {
+ pass = SEP_HORIZ;
+ } else {
+ SH_FAIL(sh, "Trying to use pl_shader_sample_ortho with a "
+ "pl_sample_src that requires scaling in multiple directions "
+ "(rx=%f, ry=%f), this is not possible!",
+ ratio[SEP_HORIZ], ratio[SEP_VERT]);
+ return false;
+ }
+
+ // We can store a separate sampler object per dimension, so dispatch the
+ // right one. This is needed for two reasons:
+ // 1. Anamorphic content can have a different scaling ratio for each
+ // dimension. In particular, you could be upscaling in one and
+ // downscaling in the other.
+ // 2. After fixing the source for `setup_src`, we lose information about
+ // the scaling ratio of the other component. (Although this is only a
+ // minor reason and could easily be changed with some boilerplate)
+ struct sh_sampler_obj *obj;
+ obj = SH_OBJ(sh, params->lut, PL_SHADER_OBJ_SAMPLER,
+ struct sh_sampler_obj, sh_sampler_uninit);
+ if (!obj)
+ return false;
+
+ if (pass != 0) {
+ obj = SH_OBJ(sh, &obj->pass2, PL_SHADER_OBJ_SAMPLER,
+ struct sh_sampler_obj, sh_sampler_uninit);
+ assert(obj);
+ }
+
+ float inv_scale = 1.0 / ratio[pass];
+ inv_scale = PL_MAX(inv_scale, 1.0);
+ if (params->no_widening)
+ inv_scale = 1.0;
+
+ struct pl_filter_config cfg = params->filter;
+ cfg.antiring = PL_DEF(cfg.antiring, params->antiring);
+ cfg.blur = PL_DEF(cfg.blur, 1.0f) * inv_scale;
+ bool update = !obj->filter || !pl_filter_config_eq(&obj->filter->params.config, &cfg);
+
+ if (update) {
+ pl_filter_free(&obj->filter);
+ obj->filter = pl_filter_generate(sh->log, pl_filter_params(
+ .config = cfg,
+ .lut_entries = SCALER_LUT_SIZE,
+ .max_row_size = gpu->limits.max_tex_2d_dim / 4,
+ .row_stride_align = 4,
+ ));
+
+ if (!obj->filter) {
+ // This should never happen, but just in case ...
+ SH_FAIL(sh, "Failed initializing separated filter!");
+ return false;
+ }
+ }
+
+ int N = obj->filter->row_size; // number of samples to convolve
+ int width = obj->filter->row_stride / 4; // width of the LUT texture
+ ident_t lut = sh_lut(sh, sh_lut_params(
+ .object = &obj->lut,
+ .var_type = PL_VAR_FLOAT,
+ .method = SH_LUT_LINEAR,
+ .width = width,
+ .height = SCALER_LUT_SIZE,
+ .comps = 4,
+ .update = update,
+ .fill = fill_ortho_lut,
+ .priv = obj,
+ ));
+ if (!lut) {
+ SH_FAIL(sh, "Failed initializing separated LUT!");
+ return false;
+ }
+
+ const int dir[SEP_PASSES][2] = {
+ [SEP_HORIZ] = {1, 0},
+ [SEP_VERT] = {0, 1},
+ };
+
+ static const char *names[SEP_PASSES] = {
+ [SEP_HORIZ] = "ortho (horiz)",
+ [SEP_VERT] = "ortho (vert)",
+ };
+
+ describe_filter(sh, &cfg, names[pass], ratio[pass], ratio[pass]);
+
+ float denom = PL_MAX(1, width - 1); // avoid division by zero
+ bool use_ar = cfg.antiring > 0 && ratio[pass] > 1.0;
+ bool use_linear = obj->filter->radius == obj->filter->radius_zero;
+ use_ar &= !use_linear; // filter has no negative weights
+
+#pragma GLSL /* pl_shader_sample_ortho */ \
+ vec4 color = vec4(0.0, 0.0, 0.0, 1.0); \
+ { \
+ vec2 pos = $pos, pt = $pt; \
+ vec2 size = vec2(textureSize($src_tex, 0)); \
+ vec2 dir = vec2(${const float: dir[pass][0]}, ${const float: dir[pass][1]}); \
+ pt *= dir; \
+ vec2 fcoord2 = fract(pos * size - vec2(0.5)); \
+ float fcoord = dot(fcoord2, dir); \
+ vec2 base = pos - fcoord * pt - pt * vec2(${const float: N / 2 - 1}); \
+ vec4 ws; \
+ float off; \
+ ${vecType: comps} c, ca = ${vecType: comps}(0.0); \
+ @if (use_ar) { \
+ ${vecType: comps} hi = ${vecType: comps}(0.0); \
+ ${vecType: comps} lo = ${vecType: comps}(1e9); \
+ @} \
+ @for (n < N) { \
+ @if @(n % 4 == 0) \
+ ws = $lut(vec2(float(@n / 4) / ${const float: denom}, fcoord)); \
+ @if @(vars.use_ar && (n == vars.N / 2 - 1 || n == vars.N / 2)) { \
+ c = textureLod($src_tex, base + pt * @n.0, 0.0).${swizzle: comps}; \
+ ca += ws[@n % 4] * c; \
+ lo = min(lo, c); \
+ hi = max(hi, c); \
+ @} else { \
+ @if (use_linear) { \
+ @if @(n % 2 == 0) { \
+ off = @n.0 + ws[@n % 4 + 1]; \
+ ca += ws[@n % 4] * textureLod($src_tex, base + pt * off, \
+ 0.0).${swizzle: comps}; \
+ @} \
+ @} else { \
+ ca += ws[@n % 4] * textureLod($src_tex, base + pt * @n.0, \
+ 0.0).${swizzle: comps}; \
+ @} \
+ @} \
+ @} \
+ @if (use_ar) \
+ ca = mix(ca, clamp(ca, lo, hi), ${float: cfg.antiring}); \
+ color.${swizzle: comps} = ${float: scale} * ca; \
+ }
+
+ return true;
+}
+
+const struct pl_distort_params pl_distort_default_params = { PL_DISTORT_DEFAULTS };
+
+void pl_shader_distort(pl_shader sh, pl_tex src_tex, int out_w, int out_h,
+ const struct pl_distort_params *params)
+{
+ pl_assert(params);
+ if (!sh_require(sh, PL_SHADER_SIG_NONE, out_w, out_h))
+ return;
+
+ const int src_w = src_tex->params.w, src_h = src_tex->params.h;
+ float rx = 1.0f, ry = 1.0f;
+ if (src_w > src_h) {
+ ry = (float) src_h / src_w;
+ } else {
+ rx = (float) src_w / src_h;
+ }
+
+ // Map from texel coordinates [0,1]² to aspect-normalized representation
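+ // (e.g. a 1920x1080 source gives rx = 1, ry = 0.5625, mapping texel
+ // (u,v) to (2u - 1, 0.5625 - 1.125v))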
+ const pl_transform2x2 tex2norm = {
+ .mat.m = {
+ { 2 * rx, 0 },
+ { 0, -2 * ry },
+ },
+ .c = { -rx, ry },
+ };
+
+ // Map from aspect-normalized representation to canvas coords [-1,1]²
+ const float sx = params->unscaled ? (float) src_w / out_w : 1.0f;
+ const float sy = params->unscaled ? (float) src_h / out_h : 1.0f;
+ const pl_transform2x2 norm2canvas = {
+ .mat.m = {
+ { sx / rx, 0 },
+ { 0, sy / ry },
+ },
+ };
+
+ struct pl_transform2x2 transform = params->transform;
+ pl_transform2x2_mul(&transform, &tex2norm);
+ pl_transform2x2_rmul(&norm2canvas, &transform);
+
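+ // Uniformly scale the transform down so its bounding box fits inside the
+ // [-1,1] canvas; the lower bound of 2.0 on `k` ensures we only ever
+ // scale down, never up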
+ if (params->constrain) {
+ pl_rect2df bb = pl_transform2x2_bounds(&transform, &(pl_rect2df) {
+ .x1 = 1, .y1 = 1,
+ });
+ const float k = fmaxf(fmaxf(pl_rect_w(bb), pl_rect_h(bb)), 2.0f);
+ pl_transform2x2_scale(&transform, 2.0f / k);
+ }
+
+ // Bind the canvas coordinates as [-1,1]², flipped vertically to correspond
+ // to normal mathematical axis conventions
+ static const pl_rect2df canvas = {
+ .x0 = -1.0f, .x1 = 1.0f,
+ .y0 = 1.0f, .y1 = -1.0f,
+ };
+
+ ident_t pos = sh_attr_vec2(sh, "pos", &canvas);
+ ident_t pt, tex = sh_bind(sh, src_tex, params->address_mode,
+ PL_TEX_SAMPLE_LINEAR, "tex", NULL, NULL, &pt);
+
+ // Bind the inverse of the tex2canvas transform (i.e. canvas2tex)
+ pl_transform2x2_invert(&transform);
+ ident_t tf = sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_mat2("tf"),
+ .data = PL_TRANSPOSE_2X2(transform.mat.m),
+ });
+
+ ident_t tf_c = sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_vec2("tf_c"),
+ .data = transform.c,
+ });
+
+ // See pl_shader_sample_bicubic
+ sh_describe(sh, "distortion");
+#pragma GLSL /* pl_shader_distort */ \
+ vec4 color; \
+ { \
+ vec2 pos = $tf * $pos + $tf_c; \
+ vec2 pt = $pt; \
+ @if (params->bicubic) { \
+ vec2 size = vec2(textureSize($tex, 0)); \
+ vec2 frac = fract(pos * size + vec2(0.5)); \
+ vec2 frac2 = frac * frac; \
+ vec2 inv = vec2(1.0) - frac; \
+ vec2 inv2 = inv * inv; \
+ vec2 w0 = 1.0/6.0 * inv2 * inv; \
+ vec2 w1 = 2.0/3.0 - 0.5 * frac2 * (2.0 - frac); \
+ vec2 w2 = 2.0/3.0 - 0.5 * inv2 * (2.0 - inv); \
+ vec2 w3 = 1.0/6.0 * frac2 * frac; \
+ vec4 g = vec4(w0 + w1, w2 + w3); \
+ vec4 h = vec4(w1, w3) / g + inv.xyxy; \
+ h.xy -= vec2(2.0); \
+ vec4 p = pos.xyxy + pt.xyxy * h; \
+ vec4 c00 = textureLod($tex, p.xy, 0.0); \
+ vec4 c01 = textureLod($tex, p.xw, 0.0); \
+ vec4 c0 = mix(c01, c00, g.y); \
+ vec4 c10 = textureLod($tex, p.zy, 0.0); \
+ vec4 c11 = textureLod($tex, p.zw, 0.0); \
+ vec4 c1 = mix(c11, c10, g.y); \
+ color = mix(c1, c0, g.x); \
+ @} else { \
+ color = texture($tex, pos); \
+ @} \
+ @if (params->alpha_mode) { \
+ vec2 border = min(pos, vec2(1.0) - pos); \
+ border = smoothstep(vec2(0.0), pt, border); \
+ @if (params->alpha_mode == PL_ALPHA_PREMULTIPLIED) \
+ color.rgba *= border.x * border.y; \
+ @else \
+ color.a *= border.x * border.y; \
+ @} \
+ }
+
+}