13 files changed, 9542 insertions, 0 deletions
diff --git a/src/shaders/colorspace.c b/src/shaders/colorspace.c
new file mode 100644
index 0000000..c7b3b5a
--- /dev/null
+++ b/src/shaders/colorspace.c
@@ -0,0 +1,2120 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <math.h>
+
+#include "cache.h"
+#include "shaders.h"
+
+#include <libplacebo/shaders/colorspace.h>
+
+// Common constants for SMPTE ST.2084 (PQ); roles refer to the OETF form below
+static const float PQ_M1 = 2610./4096 * 1./4,  // exponent applied to the linear input
+                   PQ_M2 = 2523./4096 * 128,   // exponent applied to the quotient
+                   PQ_C1 = 3424./4096,         // numerator offset
+                   PQ_C2 = 2413./4096 * 32,    // numerator slope
+                   PQ_C3 = 2392./4096 * 32;    // denominator slope
+
+// Common constants for ARIB STD-B67 (HLG); roles refer to the OETF form below
+static const float HLG_A = 0.17883277,         // scale of the log() segment
+                   HLG_B = 0.28466892,         // subtracted inside the log()
+                   HLG_C = 0.55991073,         // added after the log()
+                   HLG_REF = 1000.0 / PL_COLOR_SDR_WHITE; // 1000 in SDR-white units
+
+// Common constants for Panasonic V-Log; roles refer to the linearization below
+static const float VLOG_B = 0.00873,           // subtracted after pow(10, ...)
+                   VLOG_C = 0.241514,          // divisor of the exponent
+                   VLOG_D = 0.598206;          // offset removed from the signal
+
+// Common constants for Sony S-Log; roles refer to the linearization below
+static const float SLOG_A = 0.432699,          // divisor of the exponent
+                   SLOG_B = 0.037584,          // subtracted after pow(10, ...)
+                   SLOG_C = 0.616596 + 0.03,   // offset removed from the signal
+                   SLOG_P = 3.538813,          // S-Log2: divisor of the linear segment
+                   SLOG_Q = 0.030001,          // S-Log2: segment threshold and offset
+                   SLOG_K2 = 155.0 / 219.0;    // S-Log2: divisor of the log segment
+
+// Emit GLSL bringing `color` into alpha mode `mode`; updates `repr->alpha`.
+void pl_shader_set_alpha(pl_shader sh, struct pl_color_repr *repr,
+                         enum pl_alpha_mode mode)
+{
+    if (mode == PL_ALPHA_INDEPENDENT && repr->alpha == PL_ALPHA_PREMULTIPLIED) {
+        GLSL("if (color.a > 1e-6)               \n"
+             "    color.rgb /= vec3(color.a);   \n");
+        repr->alpha = mode;
+    } else if (mode == PL_ALPHA_PREMULTIPLIED &&
+               repr->alpha == PL_ALPHA_INDEPENDENT) {
+        GLSL("color.rgb *= vec3(color.a); \n");
+        repr->alpha = mode;
+    }
+}
+
+#ifdef PL_HAVE_DOVI
+// Emit GLSL that accumulates the MMR polynomial into `s`, using the vec4 array
+// `mmr` indexed from `mmr_idx` (0 if `single`, else coeffs.y). Terms above
+// `min_order` are guarded by a run-time check against the order in coeffs.w.
+static inline void reshape_mmr(pl_shader sh, ident_t mmr, bool single,
+                               int min_order, int max_order)
+{
+    if (single) {
+        GLSL("const uint mmr_idx = 0u; \n");
+    } else {
+        GLSL("uint mmr_idx = uint(coeffs.y); \n");
+    }
+
+    assert(min_order <= max_order);
+    // The run-time order is only consulted when min_order != max_order
+    if (min_order < max_order)
+        GLSL("uint order = uint(coeffs.w); \n");
+
+    GLSL("vec4 sigX;                            \n"
+         "s = coeffs.x;                         \n"
+         "sigX.xyz = sig.xxy * sig.yzz;         \n"
+         "sigX.w = sigX.x * sig.z;              \n"
+         "s += dot("$"[mmr_idx + 0].xyz, sig);  \n"
+         "s += dot("$"[mmr_idx + 1], sigX);     \n",
+         mmr, mmr);
+
+    if (max_order >= 2) {
+        if (min_order < 2)
+            GLSL("if (order >= 2) { \n");
+        GLSL("vec3 sig2 = sig * sig;                \n"
+             "vec4 sigX2 = sigX * sigX;             \n"
+             "s += dot("$"[mmr_idx + 2].xyz, sig2); \n"
+             "s += dot("$"[mmr_idx + 3], sigX2);    \n",
+             mmr, mmr);
+
+        if (max_order == 3) {
+            if (min_order < 3)
+                GLSL("if (order >= 3) { \n");
+            GLSL("s += dot("$"[mmr_idx + 4].xyz, sig2 * sig);   \n"
+                 "s += dot("$"[mmr_idx + 5], sigX2 * sigX);     \n",
+                 mmr, mmr);
+            if (min_order < 3)
+                GLSL("} \n");
+        }
+        if (min_order < 2)
+            GLSL("} \n");
+    }
+}
+
+static inline void reshape_poly(pl_shader sh)
+{ // Emits GLSL replacing s with coeffs.x + coeffs.y*s + coeffs.z*s*s (Horner form)
+    GLSL("s = (coeffs.z * s + coeffs.y) * s + coeffs.x; \n");
+}
+#endif
+
+void pl_shader_dovi_reshape(pl_shader sh, const struct pl_dovi_metadata *data)
+{ // Emits GLSL applying the per-component reshaping curves in `data` to color
+#ifdef PL_HAVE_DOVI
+    if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0) || !data)
+        return;
+
+    sh_describe(sh, "reshaping");
+    GLSL("// pl_shader_reshape                  \n"
+         "{                                     \n"
+         "vec3 sig;                             \n"
+         "vec4 coeffs;                          \n"
+         "float s;                              \n"
+         "sig = clamp(color.rgb, 0.0, 1.0);     \n");
+    // Host-side staging buffers, refilled from index 0 for every component
+    float coeffs_data[8][4];
+    float mmr_packed_data[8*6][4];
+
+    for (int c = 0; c < 3; c++) {
+        const struct pl_reshape_data *comp = &data->comp[c];
+        if (!comp->num_pivots)
+            continue; // no pivots: color[c] is left untouched
+
+        pl_assert(comp->num_pivots >= 2 && comp->num_pivots <= 9);
+        GLSL("s = sig[%d]; \n", c);
+
+        // Prepare coefficients for GPU (one vec4 per piece between two pivots)
+        bool has_poly = false, has_mmr = false, mmr_single = true;
+        int mmr_idx = 0, min_order = 3, max_order = 1;
+        memset(coeffs_data, 0, sizeof(coeffs_data));
+        for (int i = 0; i < comp->num_pivots - 1; i++) {
+            switch (comp->method[i]) {
+            case 0: // polynomial
+                has_poly = true;
+                coeffs_data[i][3] = 0.0; // order=0 signals polynomial
+                for (int k = 0; k < 3; k++)
+                    coeffs_data[i][k] = comp->poly_coeffs[i][k];
+                break;
+
+            case 1: // MMR: x = constant, y = offset into mmr array, w = order
+                min_order = PL_MIN(min_order, comp->mmr_order[i]);
+                max_order = PL_MAX(max_order, comp->mmr_order[i]);
+                mmr_single = !has_mmr; // stays true only for the first MMR piece
+                has_mmr = true;
+                coeffs_data[i][3] = (float) comp->mmr_order[i];
+                coeffs_data[i][0] = comp->mmr_constant[i];
+                coeffs_data[i][1] = (float) mmr_idx;
+                for (int j = 0; j < comp->mmr_order[i]; j++) {
+                    // store weights per order as two packed vec4s
+                    float *mmr = &mmr_packed_data[mmr_idx][0];
+                    mmr[0] = comp->mmr_coeffs[i][j][0];
+                    mmr[1] = comp->mmr_coeffs[i][j][1];
+                    mmr[2] = comp->mmr_coeffs[i][j][2];
+                    mmr[3] = 0.0; // unused
+                    mmr[4] = comp->mmr_coeffs[i][j][3];
+                    mmr[5] = comp->mmr_coeffs[i][j][4];
+                    mmr[6] = comp->mmr_coeffs[i][j][5];
+                    mmr[7] = comp->mmr_coeffs[i][j][6];
+                    mmr_idx += 2; // each order occupies two vec4 slots
+                }
+                break;
+
+            default:
+                pl_unreachable();
+            }
+        }
+
+        if (comp->num_pivots > 2) {
+
+            // Skip the (irrelevant) lower and upper bounds
+            float pivots_data[7];
+            memcpy(pivots_data, comp->pivots + 1,
+                   (comp->num_pivots - 2) * sizeof(pivots_data[0]));
+
+            // Fill the remainder with a quasi-infinite sentinel pivot
+            for (int i = comp->num_pivots - 2; i < PL_ARRAY_SIZE(pivots_data); i++)
+                pivots_data[i] = 1e9f;
+
+            ident_t pivots = sh_var(sh, (struct pl_shader_var) {
+                .data = pivots_data,
+                .var = {
+                    .name = "pivots",
+                    .type = PL_VAR_FLOAT,
+                    .dim_v = 1,
+                    .dim_m = 1,
+                    .dim_a = PL_ARRAY_SIZE(pivots_data),
+                },
+            });
+
+            ident_t coeffs = sh_var(sh, (struct pl_shader_var) {
+                .data = coeffs_data,
+                .var = {
+                    .name = "coeffs",
+                    .type = PL_VAR_FLOAT,
+                    .dim_v = 4,
+                    .dim_m = 1,
+                    .dim_a = PL_ARRAY_SIZE(coeffs_data),
+                },
+            });
+
+            // Binary search over the 7 pivots, expressed as nested mix() selects
+            GLSL("#define test(i) bvec4(s >= "$"[i])                \n"
+                 "#define coef(i) "$"[i]                            \n"
+                 "coeffs = mix(mix(mix(coef(0), coef(1), test(0)),  \n"
+                 "                 mix(coef(2), coef(3), test(2)),  \n"
+                 "                 test(1)),                        \n"
+                 "             mix(mix(coef(4), coef(5), test(4)),  \n"
+                 "                 mix(coef(6), coef(7), test(6)),  \n"
+                 "                 test(5)),                        \n"
+                 "             test(3));                            \n"
+                 "#undef test                                       \n"
+                 "#undef coef                                       \n",
+                 pivots, coeffs);
+
+        } else {
+
+            // Only one piece (two pivots): upload coeffs_data[0] directly
+            GLSL("coeffs = "$"; \n", sh_var(sh, (struct pl_shader_var) {
+                .var = pl_var_vec4("coeffs"),
+                .data = coeffs_data,
+            }));
+
+        }
+
+        ident_t mmr = NULL_IDENT;
+        if (has_mmr) {
+            mmr = sh_var(sh, (struct pl_shader_var) {
+                .data = mmr_packed_data,
+                .var = {
+                    .name = "mmr",
+                    .type = PL_VAR_FLOAT,
+                    .dim_v = 4,
+                    .dim_m = 1,
+                    .dim_a = mmr_idx,
+                },
+            });
+        }
+
+        if (has_mmr && has_poly) {
+            GLSL("if (coeffs.w == 0.0) { \n");
+            reshape_poly(sh);
+            GLSL("} else { \n");
+            reshape_mmr(sh, mmr, mmr_single, min_order, max_order);
+            GLSL("} \n");
+        } else if (has_poly) {
+            reshape_poly(sh);
+        } else {
+            assert(has_mmr);
+            GLSL("{ \n");
+            reshape_mmr(sh, mmr, mmr_single, min_order, max_order);
+            GLSL("} \n");
+        }
+        // Clamp the reshaped value to the range [first pivot, last pivot]
+        ident_t lo = sh_var(sh, (struct pl_shader_var) {
+            .var = pl_var_float("lo"),
+            .data = &comp->pivots[0],
+        });
+        ident_t hi = sh_var(sh, (struct pl_shader_var) {
+            .var = pl_var_float("hi"),
+            .data = &comp->pivots[comp->num_pivots - 1],
+        });
+        GLSL("color[%d] = clamp(s, "$", "$"); \n", c, lo, hi);
+    }
+
+    GLSL("} \n");
+#else
+    SH_FAIL(sh, "libplacebo was compiled without support for dolbyvision reshaping");
+#endif
+}
+
+void pl_shader_decode_color(pl_shader sh, struct pl_color_repr *repr,
+                            const struct pl_color_adjustment *params)
+{ // Emits GLSL decoding `color` according to `repr`, then applies params->gamma
+    if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+        return;
+
+    sh_describe(sh, "color decoding");
+    GLSL("// pl_shader_decode_color \n"
+         "{ \n");
+
+    // Do this first because the following operations are potentially nonlinear
+    pl_shader_set_alpha(sh, repr, PL_ALPHA_INDEPENDENT);
+
+    if (repr->sys == PL_COLOR_SYSTEM_XYZ ||
+        repr->sys == PL_COLOR_SYSTEM_DOLBYVISION)
+    {
+        ident_t scale = SH_FLOAT(pl_color_repr_normalize(repr));
+        GLSL("color.rgb *= vec3("$"); \n", scale);
+    }
+
+    if (repr->sys == PL_COLOR_SYSTEM_XYZ) {
+        pl_shader_linearize(sh, &(struct pl_color_space) {
+            .transfer = PL_COLOR_TRC_ST428,
+        });
+    }
+
+    if (repr->sys == PL_COLOR_SYSTEM_DOLBYVISION)
+        pl_shader_dovi_reshape(sh, repr->dovi);
+    // Captured up front; pl_color_repr_decode() may modify *repr (TODO confirm)
+    enum pl_color_system orig_sys = repr->sys;
+    pl_transform3x3 tr = pl_color_repr_decode(repr, params);
+
+    if (memcmp(&tr, &pl_transform3x3_identity, sizeof(tr))) { // skip identity
+        ident_t cmat = sh_var(sh, (struct pl_shader_var) {
+            .var  = pl_var_mat3("cmat"),
+            .data = PL_TRANSPOSE_3X3(tr.mat.m),
+        });
+
+        ident_t cmat_c = sh_var(sh, (struct pl_shader_var) {
+            .var  = pl_var_vec3("cmat_c"),
+            .data = tr.c,
+        });
+
+        GLSL("color.rgb = "$" * color.rgb + "$"; \n", cmat, cmat_c);
+    }
+
+    switch (orig_sys) {
+    case PL_COLOR_SYSTEM_BT_2020_C:
+        // Conversion for C'rcY'cC'bc via the BT.2020 CL system:
+        // C'bc = (B'-Y'c) / 1.9404  | C'bc <= 0
+        //      = (B'-Y'c) / 1.5816  | C'bc >  0
+        //
+        // C'rc = (R'-Y'c) / 1.7184  | C'rc <= 0
+        //      = (R'-Y'c) / 0.9936  | C'rc >  0
+        //
+        // as per the BT.2020 specification, table 4. This is a non-linear
+        // transformation because (constant) luminance receives non-equal
+        // contributions from the three different channels.
+        GLSL("// constant luminance conversion                              \n"
+             "color.br = color.br * mix(vec2(1.5816, 0.9936),               \n"
+             "                          vec2(1.9404, 1.7184),               \n"
+             "                          lessThanEqual(color.br, vec2(0.0))) \n"
+             "           + color.gg;                                        \n");
+        // Expand channels to camera-linear light. This shader currently just
+        // assumes everything uses the BT.2020 12-bit gamma function, since the
+        // difference between 10 and 12-bit is negligible for anything other
+        // than 12-bit content.
+        GLSL("vec3 lin = mix(color.rgb * vec3(1.0/4.5),                        \n"
+             "                pow((color.rgb + vec3(0.0993))*vec3(1.0/1.0993), \n"
+             "                    vec3(1.0/0.45)),                             \n"
+             "                lessThanEqual(vec3(0.08145), color.rgb));        \n");
+        // Calculate the green channel from the expanded RYcB, and recompress to G'
+        // The BT.2020 specification says Yc = 0.2627*R + 0.6780*G + 0.0593*B
+        GLSL("color.g = (lin.g - 0.2627*lin.r - 0.0593*lin.b)*1.0/0.6780;   \n"
+             "color.g = mix(color.g * 4.5,                                  \n"
+             "              1.0993 * pow(color.g, 0.45) - 0.0993,           \n"
+             "              0.0181 <= color.g);                             \n");
+        break;
+
+    case PL_COLOR_SYSTEM_BT_2100_PQ:;
+        // Conversion process from the spec:
+        //
+        // 1. L'M'S' = cmat * ICtCp
+        // 2. LMS = linearize(L'M'S')  (EOTF for PQ, inverse OETF for HLG)
+        // 3. RGB = lms2rgb * LMS
+        //
+        // After this we need to invert step 2 to arrive at non-linear RGB.
+        // (It's important we keep the transfer function conversion separate
+        // from the color system decoding, so we have to partially undo our
+        // work here even though we will end up linearizing later on anyway)
+
+        GLSL(// PQ EOTF
+             "color.rgb = pow(max(color.rgb, 0.0), vec3(1.0/%f));           \n"
+             "color.rgb = max(color.rgb - vec3(%f), 0.0)                    \n"
+             "             / (vec3(%f) - vec3(%f) * color.rgb);             \n"
+             "color.rgb = pow(color.rgb, vec3(1.0/%f));                     \n"
+             // LMS matrix
+             "color.rgb = mat3( 3.43661, -0.79133, -0.0259499,              \n"
+             "                 -2.50645,  1.98360, -0.0989137,              \n"
+             "                  0.06984, -0.192271, 1.12486) * color.rgb;   \n"
+             // PQ OETF
+             "color.rgb = pow(max(color.rgb, 0.0), vec3(%f));               \n"
+             "color.rgb = (vec3(%f) + vec3(%f) * color.rgb)                 \n"
+             "             / (vec3(1.0) + vec3(%f) * color.rgb);            \n"
+             "color.rgb = pow(color.rgb, vec3(%f));                         \n",
+             PQ_M2, PQ_C1, PQ_C2, PQ_C3, PQ_M1,
+             PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2);
+        break;
+
+    case PL_COLOR_SYSTEM_BT_2100_HLG:
+        GLSL(// HLG OETF^-1
+             "color.rgb = mix(vec3(4.0) * color.rgb * color.rgb,                \n"
+             "                exp((color.rgb - vec3(%f)) * vec3(1.0/%f))        \n"
+             "                    + vec3(%f),                                   \n"
+             "                lessThan(vec3(0.5), color.rgb));                  \n"
+             // LMS matrix
+             "color.rgb = mat3( 3.43661, -0.79133, -0.0259499,                  \n"
+             "                 -2.50645,  1.98360, -0.0989137,                  \n"
+             "                  0.06984, -0.192271, 1.12486) * color.rgb;       \n"
+             // HLG OETF
+             "color.rgb = mix(vec3(0.5) * sqrt(color.rgb),                      \n"
+             "                vec3(%f) * log(color.rgb - vec3(%f)) + vec3(%f),  \n"
+             "                lessThan(vec3(1.0), color.rgb));                  \n",
+             HLG_C, HLG_A, HLG_B,
+             HLG_A, HLG_B, HLG_C);
+        break;
+
+    case PL_COLOR_SYSTEM_DOLBYVISION:; // `;` lets a declaration follow the label
+#ifdef PL_HAVE_DOVI
+        // Dolby Vision always outputs BT.2020-referred HPE LMS, so hard-code
+        // the inverse LMS->RGB matrix corresponding to this color space.
+        pl_matrix3x3 dovi_lms2rgb = {{
+            { 3.06441879, -2.16597676,  0.10155818},
+            {-0.65612108,  1.78554118, -0.12943749},
+            { 0.01736321, -0.04725154,  1.03004253},
+        }};
+
+        pl_matrix3x3_mul(&dovi_lms2rgb, &repr->dovi->linear);
+        ident_t mat = sh_var(sh, (struct pl_shader_var) {
+            .var = pl_var_mat3("lms2rgb"),
+            .data = PL_TRANSPOSE_3X3(dovi_lms2rgb.m),
+        });
+
+        // PQ EOTF
+        GLSL("color.rgb = pow(max(color.rgb, 0.0), vec3(1.0/%f));   \n"
+             "color.rgb = max(color.rgb - vec3(%f), 0.0)            \n"
+             "             / (vec3(%f) - vec3(%f) * color.rgb);     \n"
+             "color.rgb = pow(color.rgb, vec3(1.0/%f));             \n",
+             PQ_M2, PQ_C1, PQ_C2, PQ_C3, PQ_M1);
+        // LMS matrix
+        GLSL("color.rgb = "$" * color.rgb; \n", mat);
+        // PQ OETF
+        GLSL("color.rgb = pow(max(color.rgb, 0.0), vec3(%f));       \n"
+             "color.rgb = (vec3(%f) + vec3(%f) * color.rgb)         \n"
+             "             / (vec3(1.0) + vec3(%f) * color.rgb);    \n"
+             "color.rgb = pow(color.rgb, vec3(%f));                 \n",
+             PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2);
+        break;
+#else
+        SH_FAIL(sh, "libplacebo was compiled without support for dolbyvision reshaping");
+        return;
+#endif
+
+    case PL_COLOR_SYSTEM_UNKNOWN:
+    case PL_COLOR_SYSTEM_RGB:
+    case PL_COLOR_SYSTEM_XYZ:
+    case PL_COLOR_SYSTEM_BT_601:
+    case PL_COLOR_SYSTEM_BT_709:
+    case PL_COLOR_SYSTEM_SMPTE_240M:
+    case PL_COLOR_SYSTEM_BT_2020_NC:
+    case PL_COLOR_SYSTEM_YCGCO:
+        break; // no special post-processing needed
+
+    case PL_COLOR_SYSTEM_COUNT:
+        pl_unreachable();
+    }
+
+    // Gamma adjustment. Doing this here (in non-linear light) is technically
+    // somewhat wrong, but this is just an aesthetic parameter and not really
+    // meant for colorimetric precision, so we don't care too much.
+    if (params && params->gamma == 0) {
+        // Avoid division by zero
+        GLSL("color.rgb = vec3(0.0); \n");
+    } else if (params && params->gamma != 1) {
+        ident_t gamma = sh_var(sh, (struct pl_shader_var) {
+            .var = pl_var_float("gamma"),
+            .data = &(float){ 1 / params->gamma },
+        });
+        GLSL("color.rgb = pow(max(color.rgb, vec3(0.0)), vec3("$")); \n", gamma);
+    }
+
+    GLSL("}\n");
+}
+
+void pl_shader_encode_color(pl_shader sh, const struct pl_color_repr *repr)
+{ // Emits GLSL converting `color` into the encoding described by `repr`
+    if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+        return;
+
+    sh_describe(sh, "color encoding");
+    GLSL("// pl_shader_encode_color \n"
+         "{ \n");
+
+    switch (repr->sys) {
+    case PL_COLOR_SYSTEM_BT_2020_C:
+        // Expand R'G'B' to RGB
+        GLSL("vec3 lin = mix(color.rgb * vec3(1.0/4.5),                        \n"
+             "                pow((color.rgb + vec3(0.0993))*vec3(1.0/1.0993), \n"
+             "                    vec3(1.0/0.45)),                             \n"
+             "                lessThanEqual(vec3(0.08145), color.rgb));        \n");
+
+        // Compute Yc from RGB and compress to R'Y'cB'
+        GLSL("color.g = dot(vec3(0.2627, 0.6780, 0.0593), lin);     \n"
+             "color.g = mix(color.g * 4.5,                          \n"
+             "              1.0993 * pow(color.g, 0.45) - 0.0993,   \n"
+             "              0.0181 <= color.g);                     \n");
+
+        // Compute C'bc and C'rc into color.br
+        GLSL("color.br = color.br - color.gg;                       \n"
+             "color.br *= mix(vec2(1.0/1.5816, 1.0/0.9936),         \n"
+             "                vec2(1.0/1.9404, 1.0/1.7184),         \n"
+             "                lessThanEqual(color.br, vec2(0.0)));  \n");
+        break;
+
+    case PL_COLOR_SYSTEM_BT_2100_PQ:;
+        GLSL("color.rgb = pow(max(color.rgb, 0.0), vec3(1.0/%f));           \n"
+             "color.rgb = max(color.rgb - vec3(%f), 0.0)                    \n"
+             "             / (vec3(%f) - vec3(%f) * color.rgb);             \n"
+             "color.rgb = pow(color.rgb, vec3(1.0/%f));                     \n"
+             "color.rgb = mat3(0.412109, 0.166748, 0.024170,                \n"
+             "                 0.523925, 0.720459, 0.075440,                \n"
+             "                 0.063965, 0.112793, 0.900394) * color.rgb;   \n"
+             "color.rgb = pow(color.rgb, vec3(%f));                         \n"
+             "color.rgb = (vec3(%f) + vec3(%f) * color.rgb)                 \n"
+             "             / (vec3(1.0) + vec3(%f) * color.rgb);            \n"
+             "color.rgb = pow(color.rgb, vec3(%f));                         \n",
+             PQ_M2, PQ_C1, PQ_C2, PQ_C3, PQ_M1,
+             PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2);
+        break;
+
+    case PL_COLOR_SYSTEM_BT_2100_HLG:
+        GLSL("color.rgb = mix(vec3(4.0) * color.rgb * color.rgb,                \n"
+             "                exp((color.rgb - vec3(%f)) * vec3(1.0/%f))        \n"
+             "                    + vec3(%f),                                   \n"
+             "                lessThan(vec3(0.5), color.rgb));                  \n"
+             "color.rgb = mat3(0.412109, 0.166748, 0.024170,                    \n"
+             "                 0.523925, 0.720459, 0.075440,                    \n"
+             "                 0.063965, 0.112793, 0.900394) * color.rgb;       \n"
+             "color.rgb = mix(vec3(0.5) * sqrt(color.rgb),                      \n"
+             "                vec3(%f) * log(color.rgb - vec3(%f)) + vec3(%f),  \n"
+             "                lessThan(vec3(1.0), color.rgb));                  \n",
+             HLG_C, HLG_A, HLG_B,
+             HLG_A, HLG_B, HLG_C);
+        break;
+
+    case PL_COLOR_SYSTEM_DOLBYVISION:
+        SH_FAIL(sh, "Cannot un-apply dolbyvision yet (no inverse reshaping)!");
+        return;
+
+    case PL_COLOR_SYSTEM_UNKNOWN:
+    case PL_COLOR_SYSTEM_RGB:
+    case PL_COLOR_SYSTEM_XYZ:
+    case PL_COLOR_SYSTEM_BT_601:
+    case PL_COLOR_SYSTEM_BT_709:
+    case PL_COLOR_SYSTEM_SMPTE_240M:
+    case PL_COLOR_SYSTEM_BT_2020_NC:
+    case PL_COLOR_SYSTEM_YCGCO:
+        break; // no special pre-processing needed
+
+    case PL_COLOR_SYSTEM_COUNT:
+        pl_unreachable();
+    }
+
+    // Since this is a relatively rare operation, bypass it as much as possible
+    bool skip = true;
+    skip &= PL_DEF(repr->sys, PL_COLOR_SYSTEM_RGB) == PL_COLOR_SYSTEM_RGB;
+    skip &= PL_DEF(repr->levels, PL_COLOR_LEVELS_FULL) == PL_COLOR_LEVELS_FULL;
+    skip &= !repr->bits.sample_depth || !repr->bits.color_depth ||
+             repr->bits.sample_depth == repr->bits.color_depth;
+    skip &= !repr->bits.bit_shift;
+
+    if (!skip) {
+        struct pl_color_repr copy = *repr;
+        ident_t xyzscale = NULL_IDENT;
+        if (repr->sys == PL_COLOR_SYSTEM_XYZ)
+            xyzscale = SH_FLOAT(1.0 / pl_color_repr_normalize(&copy));
+        // Invert the decoding transform to obtain the encoding transform
+        pl_transform3x3 tr = pl_color_repr_decode(&copy, NULL);
+        pl_transform3x3_invert(&tr);
+
+        ident_t cmat = sh_var(sh, (struct pl_shader_var) {
+            .var  = pl_var_mat3("cmat"),
+            .data = PL_TRANSPOSE_3X3(tr.mat.m),
+        });
+
+        ident_t cmat_c = sh_var(sh, (struct pl_shader_var) {
+            .var  = pl_var_vec3("cmat_c"),
+            .data = tr.c,
+        });
+
+        GLSL("color.rgb = "$" * color.rgb + "$"; \n", cmat, cmat_c);
+
+        if (repr->sys == PL_COLOR_SYSTEM_XYZ) {
+            pl_shader_delinearize(sh, &(struct pl_color_space) {
+                .transfer = PL_COLOR_TRC_ST428,
+            });
+            GLSL("color.rgb *= vec3("$"); \n", xyzscale);
+        }
+    }
+    // Premultiply by alpha if the target representation requires it
+    if (repr->alpha == PL_ALPHA_PREMULTIPLIED)
+        GLSL("color.rgb *= vec3(color.a); \n");
+
+    GLSL("}\n");
+}
+
+// Define a GLSL macro expanding to the RGB->Y weights (row 1 of the RGB->XYZ
+// matrix) for the primaries of `csp`, and return that macro's identifier.
+static ident_t sh_luma_coeffs(pl_shader sh, const struct pl_color_space *csp)
+{
+    pl_matrix3x3 to_xyz =
+        pl_get_rgb2xyz_matrix(pl_raw_primaries_get(csp->primaries));
+    // FIXME: Cannot use `const vec3` due to glslang bug #2025
+    ident_t luma = sh_fresh(sh, "luma_coeffs");
+    GLSLH("#define "$" vec3("$", "$", "$") \n", luma,
+          SH_FLOAT(to_xyz.m[1][0]), SH_FLOAT(to_xyz.m[1][1]),
+          SH_FLOAT(to_xyz.m[1][2]));
+    return luma;
+}
+
+void pl_shader_linearize(pl_shader sh, const struct pl_color_space *csp)
+{ // Emits GLSL applying the linearizing curve for csp->transfer to color.rgb
+    if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+        return;
+
+    if (csp->transfer == PL_COLOR_TRC_LINEAR)
+        return;
+    // Nominal luminance range of csp (PL_HDR_NORM scaling), used by the curves
+    float csp_min, csp_max;
+    pl_color_space_nominal_luma_ex(pl_nominal_luma_params(
+        .color      = csp,
+        .metadata   = PL_HDR_METADATA_HDR10,
+        .scaling    = PL_HDR_NORM,
+        .out_min    = &csp_min,
+        .out_max    = &csp_max,
+    ));
+
+    // Note that this clamp may technically violate the definition of
+    // ITU-R BT.2100, which allows for sub-blacks and super-whites to be
+    // displayed on the display where such would be possible. That said, the
+    // problem is that not all gamma curves are well-defined on the values
+    // outside this range, so we ignore it and just clamp anyway for sanity.
+    GLSL("// pl_shader_linearize           \n"
+         "color.rgb = max(color.rgb, 0.0); \n");
+
+    switch (csp->transfer) {
+    case PL_COLOR_TRC_SRGB:
+        GLSL("color.rgb = mix(color.rgb * vec3(1.0/12.92),               \n"
+             "                pow((color.rgb + vec3(0.055))/vec3(1.055), \n"
+             "                    vec3(2.4)),                            \n"
+             "                lessThan(vec3(0.04045), color.rgb));       \n");
+        goto scale_out;
+    case PL_COLOR_TRC_BT_1886: {
+        const float lb = powf(csp_min, 1/2.4f);
+        const float lw = powf(csp_max, 1/2.4f);
+        const float a = powf(lw - lb, 2.4f);
+        const float b = lb / (lw - lb);
+        GLSL("color.rgb = "$" * pow(color.rgb + vec3("$"), vec3(2.4)); \n",
+             SH_FLOAT(a), SH_FLOAT(b));
+        return;
+    }
+    case PL_COLOR_TRC_GAMMA18:
+        GLSL("color.rgb = pow(color.rgb, vec3(1.8));\n");
+        goto scale_out;
+    case PL_COLOR_TRC_GAMMA20:
+        GLSL("color.rgb = pow(color.rgb, vec3(2.0));\n");
+        goto scale_out;
+    case PL_COLOR_TRC_UNKNOWN:
+    case PL_COLOR_TRC_GAMMA22:
+        GLSL("color.rgb = pow(color.rgb, vec3(2.2));\n");
+        goto scale_out;
+    case PL_COLOR_TRC_GAMMA24:
+        GLSL("color.rgb = pow(color.rgb, vec3(2.4));\n");
+        goto scale_out;
+    case PL_COLOR_TRC_GAMMA26:
+        GLSL("color.rgb = pow(color.rgb, vec3(2.6));\n");
+        goto scale_out;
+    case PL_COLOR_TRC_GAMMA28:
+        GLSL("color.rgb = pow(color.rgb, vec3(2.8));\n");
+        goto scale_out;
+    case PL_COLOR_TRC_PRO_PHOTO:
+        GLSL("color.rgb = mix(color.rgb * vec3(1.0/16.0),              \n"
+             "                pow(color.rgb, vec3(1.8)),               \n"
+             "                lessThan(vec3(0.03125), color.rgb));     \n");
+        goto scale_out;
+    case PL_COLOR_TRC_ST428:
+        GLSL("color.rgb = vec3(52.37/48.0) * pow(color.rgb, vec3(2.6));\n");
+        goto scale_out;
+    case PL_COLOR_TRC_PQ:
+        GLSL("color.rgb = pow(color.rgb, vec3(1.0/%f));         \n"
+             "color.rgb = max(color.rgb - vec3(%f), 0.0)        \n"
+             "             / (vec3(%f) - vec3(%f) * color.rgb); \n"
+             "color.rgb = pow(color.rgb, vec3(1.0/%f));         \n"
+             // PQ's output range is 0-10000, but we need it to be relative
+             // to PL_COLOR_SDR_WHITE instead, so rescale
+             "color.rgb *= vec3(%f);                            \n",
+             PQ_M2, PQ_C1, PQ_C2, PQ_C3, PQ_M1, 10000.0 / PL_COLOR_SDR_WHITE);
+        return;
+    case PL_COLOR_TRC_HLG: {
+        const float y = fmaxf(1.2f + 0.42f * log10f(csp_max / HLG_REF), 1);
+        const float b = sqrtf(3 * powf(csp_min / csp_max, 1 / y));
+        // OETF^-1
+        GLSL("color.rgb = "$" * color.rgb + vec3("$");                  \n"
+             "color.rgb = mix(vec3(4.0) * color.rgb * color.rgb,        \n"
+             "                exp((color.rgb - vec3(%f)) * vec3(1.0/%f))\n"
+             "                    + vec3(%f),                           \n"
+             "                lessThan(vec3(0.5), color.rgb));          \n",
+             SH_FLOAT(1 - b), SH_FLOAT(b),
+             HLG_C, HLG_A, HLG_B);
+        // OOTF
+        GLSL("color.rgb *= 1.0 / 12.0;                                      \n"
+             "color.rgb *= "$" * pow(max(dot("$", color.rgb), 0.0), "$");   \n",
+             SH_FLOAT(csp_max), sh_luma_coeffs(sh, csp), SH_FLOAT(y - 1));
+        return;
+    }
+    case PL_COLOR_TRC_V_LOG:
+        GLSL("color.rgb = mix((color.rgb - vec3(0.125)) * vec3(1.0/5.6), \n"
+             "    pow(vec3(10.0), (color.rgb - vec3(%f)) * vec3(1.0/%f)) \n"
+             "              - vec3(%f),                                  \n"
+             "    lessThanEqual(vec3(0.181), color.rgb));                \n",
+             VLOG_D, VLOG_C, VLOG_B);
+        return;
+    case PL_COLOR_TRC_S_LOG1:
+        GLSL("color.rgb = pow(vec3(10.0), (color.rgb - vec3(%f)) * vec3(1.0/%f)) \n"
+             "            - vec3(%f);                                            \n",
+             SLOG_C, SLOG_A, SLOG_B);
+        return;
+    case PL_COLOR_TRC_S_LOG2:
+        GLSL("color.rgb = mix((color.rgb - vec3(%f)) * vec3(1.0/%f),      \n"
+             "    (pow(vec3(10.0), (color.rgb - vec3(%f)) * vec3(1.0/%f)) \n"
+             "              - vec3(%f)) * vec3(1.0/%f),                   \n"
+             "    lessThanEqual(vec3(%f), color.rgb));                    \n",
+             SLOG_Q, SLOG_P, SLOG_C, SLOG_A, SLOG_B, SLOG_K2, SLOG_Q);
+        return;
+    case PL_COLOR_TRC_LINEAR:
+    case PL_COLOR_TRC_COUNT:
+        break;
+    }
+    // LINEAR returned early above, so only an invalid transfer can get here
+    pl_unreachable();
+    // Affinely remap the curve output: 0 -> csp_min, 1 -> csp_max
+scale_out:
+    if (csp_max != 1 || csp_min != 0) {
+        GLSL("color.rgb = "$" * color.rgb + vec3("$"); \n",
+             SH_FLOAT(csp_max - csp_min), SH_FLOAT(csp_min));
+    }
+}
+
+// Emits GLSL that re-encodes `color.rgb` from linear light into the transfer
+// function given by `csp->transfer`.
+//
+// Nothing is emitted if the shader signature check fails, or if the transfer
+// is already PL_COLOR_TRC_LINEAR. The emitted code runs in three stages:
+//   1. for the gamma-like curves, rescale the nominal luminance range
+//      [csp_min, csp_max] onto [0, 1] (the reverse of the `scale_out` step at
+//      the end of pl_shader_linearize)
+//   2. clamp negative values to zero
+//   3. apply the transfer-specific encoding curve
+void pl_shader_delinearize(pl_shader sh, const struct pl_color_space *csp)
+{
+    if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+        return;
+
+    if (csp->transfer == PL_COLOR_TRC_LINEAR)
+        return;
+
+    // Nominal black/white level of this color space, in PL_HDR_NORM units
+    float csp_min, csp_max;
+    pl_color_space_nominal_luma_ex(pl_nominal_luma_params(
+        .color      = csp,
+        .metadata   = PL_HDR_METADATA_HDR10,
+        .scaling    = PL_HDR_NORM,
+        .out_min    = &csp_min,
+        .out_max    = &csp_max,
+    ));
+
+    GLSL("// pl_shader_delinearize \n");
+    // Stage 1: range normalization, only for the curves listed in the first
+    // group; the second group handles its own scaling further below
+    switch (csp->transfer) {
+    case PL_COLOR_TRC_UNKNOWN:
+    case PL_COLOR_TRC_SRGB:
+    case PL_COLOR_TRC_LINEAR:
+    case PL_COLOR_TRC_GAMMA18:
+    case PL_COLOR_TRC_GAMMA20:
+    case PL_COLOR_TRC_GAMMA22:
+    case PL_COLOR_TRC_GAMMA24:
+    case PL_COLOR_TRC_GAMMA26:
+    case PL_COLOR_TRC_GAMMA28:
+    case PL_COLOR_TRC_PRO_PHOTO:
+    case PL_COLOR_TRC_ST428: ;
+        if (csp_max != 1 || csp_min != 0) {
+            // color = (color - csp_min) / (csp_max - csp_min)
+            GLSL("color.rgb = "$" * color.rgb + vec3("$"); \n",
+                 SH_FLOAT(1 / (csp_max - csp_min)),
+                 SH_FLOAT(-csp_min / (csp_max - csp_min)));
+        }
+        break;
+    case PL_COLOR_TRC_BT_1886:
+    case PL_COLOR_TRC_PQ:
+    case PL_COLOR_TRC_HLG:
+    case PL_COLOR_TRC_V_LOG:
+    case PL_COLOR_TRC_S_LOG1:
+    case PL_COLOR_TRC_S_LOG2:
+        break; // scene-referred or absolute scale
+    case PL_COLOR_TRC_COUNT:
+        pl_unreachable();
+    }
+
+    // Stage 2: the curves below use pow()/sqrt()/log() on the color value
+    GLSL("color.rgb = max(color.rgb, 0.0); \n");
+
+    // Stage 3: every handled case returns directly after emitting its curve
+    switch (csp->transfer) {
+    case PL_COLOR_TRC_SRGB:
+        GLSL("color.rgb = mix(color.rgb * vec3(12.92),                        \n"
+             "                vec3(1.055) * pow(color.rgb, vec3(1.0/2.4))     \n"
+             "                    - vec3(0.055),                              \n"
+             "                lessThanEqual(vec3(0.0031308), color.rgb));     \n");
+        return;
+    case PL_COLOR_TRC_BT_1886: {
+        // Gain `a` and black lift `b` derived from the black/white levels
+        const float lb = powf(csp_min, 1/2.4f);
+        const float lw = powf(csp_max, 1/2.4f);
+        const float a = powf(lw - lb, 2.4f);
+        const float b = lb / (lw - lb);
+        GLSL("color.rgb = pow("$" * color.rgb, vec3(1.0/2.4)) - vec3("$"); \n",
+             SH_FLOAT(1.0 / a), SH_FLOAT(b));
+        return;
+    }
+    case PL_COLOR_TRC_GAMMA18:
+        GLSL("color.rgb = pow(color.rgb, vec3(1.0/1.8));\n");
+        return;
+    case PL_COLOR_TRC_GAMMA20:
+        GLSL("color.rgb = pow(color.rgb, vec3(1.0/2.0));\n");
+        return;
+    case PL_COLOR_TRC_UNKNOWN: // unknown transfer is treated as gamma 2.2
+    case PL_COLOR_TRC_GAMMA22:
+        GLSL("color.rgb = pow(color.rgb, vec3(1.0/2.2));\n");
+        return;
+    case PL_COLOR_TRC_GAMMA24:
+        GLSL("color.rgb = pow(color.rgb, vec3(1.0/2.4));\n");
+        return;
+    case PL_COLOR_TRC_GAMMA26:
+        GLSL("color.rgb = pow(color.rgb, vec3(1.0/2.6));\n");
+        return;
+    case PL_COLOR_TRC_GAMMA28:
+        GLSL("color.rgb = pow(color.rgb, vec3(1.0/2.8));\n");
+        return;
+    case PL_COLOR_TRC_ST428:
+        GLSL("color.rgb = pow(color.rgb * vec3(48.0/52.37), vec3(1.0/2.6));\n");
+        return;
+    case PL_COLOR_TRC_PRO_PHOTO:
+        GLSL("color.rgb = mix(color.rgb * vec3(16.0),                        \n"
+             "                pow(color.rgb, vec3(1.0/1.8)),                 \n"
+             "                lessThanEqual(vec3(0.001953), color.rgb));     \n");
+        return;
+    case PL_COLOR_TRC_PQ:
+        // Divide by (10000 / PL_COLOR_SDR_WHITE) before applying the PQ curve
+        GLSL("color.rgb *= vec3(1.0/%f);                         \n"
+             "color.rgb = pow(color.rgb, vec3(%f));              \n"
+             "color.rgb = (vec3(%f) + vec3(%f) * color.rgb)      \n"
+             "             / (vec3(1.0) + vec3(%f) * color.rgb); \n"
+             "color.rgb = pow(color.rgb, vec3(%f));              \n",
+             10000 / PL_COLOR_SDR_WHITE, PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2);
+        return;
+    case PL_COLOR_TRC_HLG: {
+        // `y` is the system gamma derived from the peak (never below 1),
+        // `b` the black level offset applied after the OETF
+        const float y = fmaxf(1.2f + 0.42f * log10f(csp_max / HLG_REF), 1);
+        const float b = sqrtf(3 * powf(csp_min / csp_max, 1 / y));
+        // OOTF^-1
+        GLSL("color.rgb *= 1.0 / "$";                                       \n"
+             "color.rgb *= 12.0 * max(1e-6, pow(dot("$", color.rgb), "$")); \n",
+             SH_FLOAT(csp_max), sh_luma_coeffs(sh, csp), SH_FLOAT((1 - y) / y));
+        // OETF
+        GLSL("color.rgb = mix(vec3(0.5) * sqrt(color.rgb),                      \n"
+             "                vec3(%f) * log(color.rgb - vec3(%f)) + vec3(%f),  \n"
+             "                lessThan(vec3(1.0), color.rgb));                  \n"
+             "color.rgb = "$" * color.rgb + vec3("$");                          \n",
+             HLG_A, HLG_B, HLG_C,
+             SH_FLOAT(1 / (1 - b)), SH_FLOAT(-b / (1 - b)));
+        return;
+    }
+    case PL_COLOR_TRC_V_LOG:
+        // GLSL log() is the natural logarithm; dividing the coefficient by
+        // ln(10) turns it into the base-10 logarithm of the curve definition
+        GLSL("color.rgb = mix(vec3(5.6) * color.rgb + vec3(0.125),       \n"
+             "                vec3(%f) * log(color.rgb + vec3(%f))       \n"
+             "                    + vec3(%f),                            \n"
+             "                lessThanEqual(vec3(0.01), color.rgb));     \n",
+             VLOG_C / M_LN10, VLOG_B, VLOG_D);
+        return;
+    case PL_COLOR_TRC_S_LOG1:
+        GLSL("color.rgb = vec3(%f) * log(color.rgb + vec3(%f)) + vec3(%f);\n",
+             SLOG_A / M_LN10, SLOG_B, SLOG_C);
+        return;
+    case PL_COLOR_TRC_S_LOG2:
+        GLSL("color.rgb = mix(vec3(%f) * color.rgb + vec3(%f),                \n"
+             "                vec3(%f) * log(vec3(%f) * color.rgb + vec3(%f)) \n"
+             "                    + vec3(%f),                                 \n"
+             "                lessThanEqual(vec3(0.0), color.rgb));           \n",
+             SLOG_P, SLOG_Q, SLOG_A / M_LN10, SLOG_K2, SLOG_B, SLOG_C);
+        return;
+    case PL_COLOR_TRC_LINEAR:
+    case PL_COLOR_TRC_COUNT:
+        break;
+    }
+
+    // LINEAR was filtered out at the top and COUNT is not a real transfer
+    pl_unreachable();
+}
+
+// Default sigmoid parameters; pl_shader_sigmoidize / pl_shader_unsigmoidize
+// fall back to these when given NULL params or a zero `center` / `slope`.
+const struct pl_sigmoid_params pl_sigmoid_default_params = { PL_SIGMOID_DEFAULTS };
+
+// Emits GLSL that clamps `color` to [0, 1] and then passes all four channels
+// through the inverse of a logistic curve described by `params`. NULL params,
+// or zero-valued fields, select the values from pl_sigmoid_default_params.
+void pl_shader_sigmoidize(pl_shader sh, const struct pl_sigmoid_params *params)
+{
+    if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+        return;
+
+    const struct pl_sigmoid_params *par =
+        PL_DEF(params, &pl_sigmoid_default_params);
+    const float mid   = PL_DEF(par->center, pl_sigmoid_default_params.center);
+    const float steep = PL_DEF(par->slope, pl_sigmoid_default_params.slope);
+
+    // The logistic curve does not pass through (0,0) and (1,1) by itself, so
+    // evaluate it at 0 (`at_zero`) and at 1, and use the difference (`span`)
+    // to rescale the input before inverting.
+    const float at_zero = 1.0 / (1 + expf(steep * mid));
+    const float span    = 1.0 / (1 + expf(steep * (mid - 1))) - at_zero;
+
+    // color = mid - 1/steep * log(1 / (color * span + at_zero) - 1)
+    GLSL("// pl_shader_sigmoidize                               \n"
+         "color = clamp(color, 0.0, 1.0);                       \n"
+         "color = vec4("$") - vec4("$") *                       \n"
+         "    log(vec4(1.0) / (color * vec4("$") + vec4("$"))   \n"
+         "        - vec4(1.0));                                 \n",
+         SH_FLOAT(mid), SH_FLOAT(1.0 / steep),
+         SH_FLOAT(span), SH_FLOAT(at_zero));
+}
+
+// Emits GLSL that clamps `color` to [0, 1] and applies the logistic curve
+// described by `params`, rescaled with the same offset/scale pair that
+// pl_shader_sigmoidize computes. NULL params or zero-valued fields select the
+// values from pl_sigmoid_default_params.
+void pl_shader_unsigmoidize(pl_shader sh, const struct pl_sigmoid_params *params)
+{
+    if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+        return;
+
+    const struct pl_sigmoid_params *par =
+        PL_DEF(params, &pl_sigmoid_default_params);
+    const float mid   = PL_DEF(par->center, pl_sigmoid_default_params.center);
+    const float steep = PL_DEF(par->slope, pl_sigmoid_default_params.slope);
+
+    // Curve value at 0, and the difference between its values at 1 and at 0
+    const float at_zero = 1.0 / (1 + expf(steep * mid));
+    const float span    = 1.0 / (1 + expf(steep * (mid - 1))) - at_zero;
+
+    // color = (1/span) / (1 + exp(steep * (mid - color))) - at_zero/span
+    GLSL("// pl_shader_unsigmoidize                                 \n"
+         "color = clamp(color, 0.0, 1.0);                           \n"
+         "color = vec4("$") /                                       \n"
+         "    (vec4(1.0) + exp(vec4("$") * (vec4("$") - color)))    \n"
+         "    - vec4("$");                                          \n",
+         SH_FLOAT(1.0 / span),
+         SH_FLOAT(steep), SH_FLOAT(mid),
+         SH_FLOAT(at_zero / span));
+}
+
+// Peak detection presets. pl_shader_detect_peak uses the default preset when
+// it is called with NULL params.
+const struct pl_peak_detect_params pl_peak_detect_default_params = { PL_PEAK_DETECT_DEFAULTS };
+const struct pl_peak_detect_params pl_peak_detect_high_quality_params = { PL_PEAK_DETECT_HQ_DEFAULTS };
+
+// Returns whether two parameter sets would produce the same measurement.
+// `allow_delayed` is deliberately left out, since it only affects when the
+// result is read back and not what is measured.
+static bool peak_detect_params_eq(const struct pl_peak_detect_params *a,
+                                  const struct pl_peak_detect_params *b)
+{
+    if (a->smoothing_period != b->smoothing_period)
+        return false;
+    if (a->scene_threshold_low != b->scene_threshold_low)
+        return false;
+    if (a->scene_threshold_high != b->scene_threshold_high)
+        return false;
+    return a->percentile == b->percentile;
+}
+
+// Compile-time layout constants shared by the peak detection shader and the
+// host-side readback code.
+enum {
+    // Split the peak buffer into several independent slices to reduce pressure
+    // on global atomics
+    SLICES = 12,
+
+    // How many bits to use for storing PQ data. Be careful when setting this
+    // too high, as it may overflow `unsigned int` on large video sources.
+    //
+    // The value chosen is enough to guarantee no overflow for an 8K x 4K frame
+    // consisting entirely of 100% 10k nits PQ values, with 16x16 workgroups.
+    PQ_BITS     = 14,
+    PQ_MAX      = (1 << PQ_BITS) - 1, // 16383, largest quantized PQ value
+
+    // How many bits to use for the histogram. We bias the histogram down
+    // by half the PQ range (~90 nits), effectively clumping the SDR part
+    // of the image into a single histogram bin.
+    HIST_BITS   = 7,
+    HIST_BIAS   = 1 << (HIST_BITS - 1),         // 64 bins skipped at the bottom
+    HIST_BINS   = (1 << HIST_BITS) - HIST_BIAS, // 64 bins actually stored
+
+    // Convert from histogram bin to (starting) PQ value
+#define HIST_PQ(bin) (((bin) + HIST_BIAS) << (PQ_BITS - HIST_BITS))
+};
+
+
+// HIST_PQ shifts by (PQ_BITS - HIST_BITS), which must not be negative
+pl_static_assert(PQ_BITS >= HIST_BITS);
+
+// Host-side view of the peak detection storage buffer. The shader accumulates
+// into these counters with atomics, one set per slice, and update_peak_buf()
+// reads the whole struct back. The field offsets are exported to the shader
+// through `peak_buf_vars`.
+struct peak_buf_data {
+    unsigned frame_wg_count[SLICES]; // number of work groups processed
+    unsigned frame_wg_active[SLICES];// number of active (nonzero) work groups
+    unsigned frame_sum_pq[SLICES];   // sum of PQ Y values over all WGs (PQ_BITS)
+    unsigned frame_max_pq[SLICES];   // maximum PQ Y value among these WGs (PQ_BITS)
+    unsigned frame_hist[SLICES][HIST_BINS]; // always allocated, conditionally used
+};
+
+// Buffer variable descriptions for every field of `struct peak_buf_data`,
+// passed to sh_desc() when binding the peak detection buffer.
+//
+// Each field is declared as a flat array of uints: `dim_a` is the field's
+// total size divided by sizeof(unsigned), so the two-dimensional `frame_hist`
+// becomes a one-dimensional array of SLICES * HIST_BINS elements (the shader
+// indexes it as `slice * HIST_BINS + i`).
+static const struct pl_buffer_var peak_buf_vars[] = {
+#define VAR(field) {                                                            \
+    .var = {                                                                    \
+        .name = #field,                                                         \
+        .type = PL_VAR_UINT,                                                    \
+        .dim_v = 1,                                                             \
+        .dim_m = 1,                                                             \
+        .dim_a = sizeof(((struct peak_buf_data *) NULL)->field) /               \
+                 sizeof(unsigned),                                              \
+    },                                                                          \
+    .layout = {                                                                 \
+        .offset = offsetof(struct peak_buf_data, field),                        \
+        .size   = sizeof(((struct peak_buf_data *) NULL)->field),               \
+        .stride = sizeof(unsigned),                                             \
+    },                                                                          \
+}
+    VAR(frame_wg_count),
+    VAR(frame_wg_active),
+    VAR(frame_sum_pq),
+    VAR(frame_max_pq),
+    VAR(frame_hist),
+#undef VAR
+};
+
+// Private state behind a PL_SHADER_OBJ_COLOR_MAP shader object. Released by
+// sh_color_map_uninit().
+struct sh_color_map_obj {
+    // Tone map state
+    struct {
+        struct pl_tone_map_params params;
+        pl_shader_obj lut;
+    } tone;
+
+    // Gamut map state
+    struct {
+        pl_shader_obj lut;
+    } gamut;
+
+    // Peak detection state
+    struct {
+        struct pl_peak_detect_params params;    // currently active parameters
+        pl_buf buf;                             // pending peak detection buffer
+        pl_buf readback;                        // readback buffer (fallback)
+        float avg_pq;                           // current (smoothed) values
+        float max_pq;                           // (avg_pq == 0 means "no data yet")
+    } peak;
+};
+
+// Builds the cache signature of a gamut mapping LUT from the mapping function
+// name, both gamuts, the luminance range and the tuning constants. The LUT
+// size is not included, since sh_lut checks it separately.
+static uint64_t gamut_map_signature(const struct pl_gamut_map_params *par)
+{
+    uint64_t hash = CACHE_KEY_GAMUT_LUT;
+
+    // Which mapping function
+    pl_hash_merge(&hash, pl_str0_hash(par->function->name));
+
+    // Source and destination gamut
+    pl_hash_merge(&hash, pl_var_hash(par->input_gamut));
+    pl_hash_merge(&hash, pl_var_hash(par->output_gamut));
+
+    // Luminance range
+    pl_hash_merge(&hash, pl_var_hash(par->min_luma));
+    pl_hash_merge(&hash, pl_var_hash(par->max_luma));
+
+    // Function-specific tunables
+    pl_hash_merge(&hash, pl_var_hash(par->constants));
+
+    return hash;
+}
+
+// Destructor for `struct sh_color_map_obj` (registered through SH_OBJ):
+// releases both LUTs and both peak detection buffers, then zeroes the struct.
+static void sh_color_map_uninit(pl_gpu gpu, void *ptr)
+{
+    struct sh_color_map_obj *map = ptr;
+
+    // Lookup tables
+    pl_shader_obj_destroy(&map->tone.lut);
+    pl_shader_obj_destroy(&map->gamut.lut);
+
+    // GPU buffers used by peak detection
+    pl_buf_destroy(gpu, &map->peak.buf);
+    pl_buf_destroy(gpu, &map->peak.readback);
+
+    memset(map, 0, sizeof(*map));
+}
+
+// Converts a smoothing period `rate` into the per-update blend factor of a
+// first-order IIR low-pass filter: `state += coeff * (sample - state)`.
+//
+// Returns 1 (no smoothing, the new sample replaces the state) for any rate
+// that is not strictly positive. This includes negative and NaN rates, which
+// would otherwise give a negative or NaN coefficient and push the filter
+// state away from the measurement. Positive rates give a value in (0, 1].
+static inline float iir_coeff(float rate)
+{
+    if (!(rate > 0.0f)) // also catches NaN
+        return 1.0f;
+    return 1.0f - expf(-1.0f / rate);
+}
+
+// Derives the frame peak from the read-back statistics, as a fraction of
+// PQ_MAX.
+//
+// Without a usable percentile (<= 0 or >= 100), without histogram data, or
+// when the percentile rounds up to the full pixel count, this is simply the
+// largest `frame_max_pq` of any slice. Otherwise the histogram is walked from
+// the darkest bin upwards until the requested percentile is reached, and the
+// result is interpolated linearly inside that bin.
+static float measure_peak(const struct peak_buf_data *data, float percentile)
+{
+    // Brightest value recorded by any slice
+    unsigned peak_pq = data->frame_max_pq[0];
+    for (int s = 1; s < SLICES; s++) {
+        if (data->frame_max_pq[s] > peak_pq)
+            peak_pq = data->frame_max_pq[s];
+    }
+    const float frame_max = (float) peak_pq / PQ_MAX;
+    if (percentile <= 0 || percentile >= 100)
+        return frame_max;
+
+    // Merge the per-slice histograms into a single one
+    unsigned bin_count[HIST_BINS] = {0};
+    unsigned total_pixels = 0;
+    for (int s = 0; s < SLICES; s++) {
+        for (int b = 0; b < HIST_BINS; b++) {
+            bin_count[b] += data->frame_hist[s][b];
+            total_pixels += data->frame_hist[s][b];
+        }
+    }
+    if (!total_pixels) // no histogram data available?
+        return frame_max;
+
+    const unsigned target_pixel = ceilf(percentile / 100.0f * total_pixels);
+    if (target_pixel >= total_pixels)
+        return frame_max;
+
+    unsigned below = 0; // pixels in all bins darker than the current one
+    for (int b = 0; b < HIST_BINS; b++) {
+        const unsigned upto = below + bin_count[b];
+        if (upto >= target_pixel) {
+            // This bin contains the target pixel. Its frequency boundaries:
+            const unsigned count_low  = below;    // last pixel of previous bin
+            const unsigned count_high = upto + 1; // first pixel of next bin
+            pl_assert(count_low < target_pixel && target_pixel < count_high);
+
+            // PQ values at those boundaries; the topmost populated bin ends
+            // at the measured frame maximum instead of the nominal bin edge
+            const float pq_low = (float) HIST_PQ(b) / PQ_MAX;
+            float pq_high      = (float) HIST_PQ(b + 1) / PQ_MAX;
+            if (count_high > total_pixels)
+                pq_high = frame_max;
+
+            // Assume the pixels are spread evenly across the bin
+            const float ratio = (float) (target_pixel - count_low) /
+                                        (count_high - count_low);
+            return PL_MIX(pq_low, pq_high, ratio);
+        }
+
+        below = upto;
+    }
+
+    pl_unreachable();
+}
+
+// Reads the pending peak detection buffer (if there is one) back to the host
+// and folds the measurement into the smoothed state in `obj->peak`.
+//
+// With `force` set the buffer is read even if `allow_delayed` is enabled and
+// the GPU may still be using it, and the buffer is released in every case.
+static void update_peak_buf(pl_gpu gpu, struct sh_color_map_obj *obj, bool force)
+{
+    if (!obj->peak.buf)
+        return; // nothing pending
+
+    const struct pl_peak_detect_params *par = &obj->peak.params;
+    const bool may_defer = !force && par->allow_delayed;
+    if (may_defer && pl_buf_poll(gpu, obj->peak.buf, 0))
+        return; // buffer not ready yet
+
+    // Read directly, or via the host-readable fallback buffer if one exists
+    struct peak_buf_data data = {0};
+    pl_buf src = obj->peak.buf;
+    if (obj->peak.readback) {
+        src = obj->peak.readback;
+        pl_buf_copy(gpu, src, 0, obj->peak.buf, 0, sizeof(data));
+    }
+    const bool ok = pl_buf_read(gpu, src, 0, &data, sizeof(data));
+
+    if (!ok || data.frame_wg_count[0] == 0) {
+        // Nothing usable came back; possibly the shader has not executed yet
+        if (!ok) {
+            PL_ERR(gpu, "Failed reading peak detection buffer!");
+        } else if (par->allow_delayed) {
+            PL_TRACE(gpu, "Peak detection buffer not yet ready, ignoring..");
+        } else {
+            PL_WARN(gpu, "Peak detection usage error: attempted detecting peak "
+                    "and using detected peak in the same shader program, "
+                    "but `params->allow_delayed` is false! Ignoring, but "
+                    "expect incorrect output.");
+        }
+        if (force || !ok)
+            pl_buf_destroy(gpu, &obj->peak.buf);
+        return;
+    }
+
+    // Measurement is complete, so the buffer is no longer needed
+    pl_buf_destroy(gpu, &obj->peak.buf);
+
+    // Combine the per-slice counters
+    uint64_t sum_pq = 0u, wg_total = 0u, wg_lit = 0u;
+    for (int s = 0; s < SLICES; s++) {
+        wg_total += data.frame_wg_count[s];
+        wg_lit   += data.frame_wg_active[s];
+        sum_pq   += data.frame_sum_pq[s];
+    }
+
+    // A frame without any active work group is solid black
+    float avg_pq = PL_COLOR_HDR_BLACK, max_pq = PL_COLOR_HDR_BLACK;
+    if (wg_lit) {
+        avg_pq = (float) sum_pq / (wg_lit * PQ_MAX);
+        max_pq = measure_peak(&data, par->percentile);
+    }
+
+    if (obj->peak.avg_pq) {
+        // Ignore small deviations from existing peak (rounding error)
+        static const float epsilon = 1.0f / PQ_MAX;
+        if (fabsf(avg_pq - obj->peak.avg_pq) < epsilon)
+            avg_pq = obj->peak.avg_pq;
+        if (fabsf(max_pq - obj->peak.max_pq) < epsilon)
+            max_pq = obj->peak.max_pq;
+    } else {
+        // No previous data: start from the first measurement
+        obj->peak.avg_pq = avg_pq;
+        obj->peak.max_pq = max_pq;
+    }
+
+    // First-order IIR low-pass over successive measurements
+    const float blend = iir_coeff(par->smoothing_period);
+    obj->peak.avg_pq += blend * (avg_pq - obj->peak.avg_pq);
+    obj->peak.max_pq += blend * (max_pq - obj->peak.max_pq);
+
+    // Scene change hysteresis: the further the new average is from the
+    // smoothed one, the more the smoothed state is pulled to the measurement
+    if (par->scene_threshold_low > 0 && par->scene_threshold_high > 0) {
+        const float log10_pq = 1e-2f; // experimentally determined approximate
+        const float lo = par->scene_threshold_low * log10_pq;
+        const float hi = par->scene_threshold_high * log10_pq;
+        const float bias = (float) wg_lit / wg_total;
+        const float delta = bias * fabsf(avg_pq - obj->peak.avg_pq);
+        const float jump = pl_smoothstep(lo, hi, delta);
+        obj->peak.avg_pq = PL_MIX(obj->peak.avg_pq, avg_pq, jump);
+        obj->peak.max_pq = PL_MIX(obj->peak.max_pq, max_pq, jump);
+    }
+}
+
+// Appends a compute pass to `sh` that measures the luminance statistics of
+// the current color (in color space `csp`) into a freshly created storage
+// buffer owned by `*state`. The results are read back later by
+// update_peak_buf(). The shader restores `color` at the end, so the pass
+// leaves the image itself untouched.
+//
+// `params` may be NULL to use pl_peak_detect_default_params.
+//
+// Returns false, having attached no measurement, if the shader signature is
+// incompatible, there is no GPU or its SSBO size limit is too small, compute
+// shaders with enough shared memory are unavailable, the state object cannot
+// be obtained, or the buffer cannot be created.
+bool pl_shader_detect_peak(pl_shader sh, struct pl_color_space csp,
+                           pl_shader_obj *state,
+                           const struct pl_peak_detect_params *params)
+{
+    params = PL_DEF(params, &pl_peak_detect_default_params);
+    if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+        return false;
+
+    pl_gpu gpu = SH_GPU(sh);
+    if (!gpu || gpu->limits.max_ssbo_size < sizeof(struct peak_buf_data)) {
+        PL_ERR(sh, "HDR peak detection requires a GPU with support for at "
+               "least %zu bytes of SSBO data (supported: %zu)",
+               sizeof(struct peak_buf_data), gpu ? gpu->limits.max_ssbo_size : 0);
+        return false;
+    }
+
+    // Shared memory: three uint accumulators, plus the per-work-group
+    // histogram when a percentile strictly between 0 and 100 is requested
+    const bool use_histogram = params->percentile > 0 && params->percentile < 100;
+    size_t shmem_req = 3 * sizeof(uint32_t);
+    if (use_histogram)
+        shmem_req += sizeof(uint32_t[HIST_BINS]);
+
+    if (!sh_try_compute(sh, 16, 16, true, shmem_req)) {
+        PL_ERR(sh, "HDR peak detection requires compute shaders with support "
+               "for at least %zu bytes of shared memory! (avail: %zu)",
+               shmem_req, sh_glsl(sh).max_shmem_size);
+        return false;
+    }
+
+    struct sh_color_map_obj *obj;
+    obj = SH_OBJ(sh, state, PL_SHADER_OBJ_COLOR_MAP, struct sh_color_map_obj,
+                 sh_color_map_uninit);
+    if (!obj)
+        return false;
+
+    // Same parameters as last time: consume the previous measurement first.
+    // Different parameters: the accumulated state is discarded instead.
+    // Either path leaves `obj->peak.buf` unset.
+    if (peak_detect_params_eq(&obj->peak.params, params)) {
+        update_peak_buf(gpu, obj, true); // prevent over-writing previous frame
+    } else {
+        pl_reset_detected_peak(*state);
+    }
+
+    pl_assert(!obj->peak.buf);
+    static const struct peak_buf_data zero = {0};
+
+    // First try a single device buffer that is both storable and host
+    // readable. If that fails, create a separate host-readable readback
+    // buffer and retry with a storable-only buffer.
+retry_ssbo:
+    if (obj->peak.readback) {
+        obj->peak.buf = pl_buf_create(gpu, pl_buf_params(
+            .size           = sizeof(struct peak_buf_data),
+            .storable       = true,
+            .initial_data   = &zero,
+        ));
+    } else {
+        obj->peak.buf = pl_buf_create(gpu, pl_buf_params(
+            .size           = sizeof(struct peak_buf_data),
+            .memory_type    = PL_BUF_MEM_DEVICE,
+            .host_readable  = true,
+            .storable       = true,
+            .initial_data   = &zero,
+        ));
+    }
+
+    if (!obj->peak.buf && !obj->peak.readback) {
+        PL_WARN(sh, "Failed creating host-readable peak detection SSBO, "
+                "retrying with fallback buffer");
+        obj->peak.readback = pl_buf_create(gpu, pl_buf_params(
+            .size           = sizeof(struct peak_buf_data),
+            .host_readable  = true,
+        ));
+        if (obj->peak.readback)
+            goto retry_ssbo;
+    }
+
+    if (!obj->peak.buf) {
+        SH_FAIL(sh, "Failed creating peak detection SSBO!");
+        return false;
+    }
+
+    obj->peak.params = *params;
+
+    sh_desc(sh, (struct pl_shader_desc) {
+        .desc = {
+            .name   = "PeakBuf",
+            .type   = PL_DESC_BUF_STORAGE,
+            .access = PL_DESC_ACCESS_READWRITE,
+        },
+        .binding.object  = obj->peak.buf,
+        .buffer_vars     = (struct pl_buffer_var *) peak_buf_vars,
+        .num_buffer_vars = PL_ARRAY_SIZE(peak_buf_vars),
+    });
+
+    // Each work group is assigned to one of SLICES counter sets based on its
+    // linear index; the input color is saved so it can be restored afterwards
+    sh_describe(sh, "peak detection");
+    GLSL("// pl_shader_detect_peak                                      \n"
+         "{                                                             \n"
+         "const uint wg_size = gl_WorkGroupSize.x * gl_WorkGroupSize.y; \n"
+         "uint wg_idx = gl_WorkGroupID.y * gl_NumWorkGroups.x +         \n"
+         "              gl_WorkGroupID.x;                               \n"
+         "uint slice = wg_idx %% %du;                                   \n"
+         "vec4 color_orig = color;                                      \n",
+         SLICES);
+
+    // For performance, we want to do as few atomic operations on global
+    // memory as possible, so use an atomic in shmem for the work group.
+    ident_t wg_sum   = sh_fresh(sh, "wg_sum"),
+            wg_max   = sh_fresh(sh, "wg_max"),
+            wg_black = sh_fresh(sh, "wg_black"),
+            wg_hist  = NULL_IDENT;
+    GLSLH("shared uint "$", "$", "$"; \n", wg_sum, wg_max, wg_black);
+    if (use_histogram) {
+        wg_hist = sh_fresh(sh, "wg_hist");
+        GLSLH("shared uint "$"[%u]; \n", wg_hist, HIST_BINS);
+        GLSL("for (uint i = gl_LocalInvocationIndex; i < %du; i += wg_size) \n"
+             "    "$"[i] = 0u;                                              \n",
+             HIST_BINS, wg_hist);
+    }
+    GLSL($" = 0u; "$" = 0u; "$" = 0u; \n"
+         "barrier();                  \n",
+         wg_sum, wg_max, wg_black);
+
+    // Decode color into linear light representation
+    pl_color_space_infer(&csp);
+    pl_shader_linearize(sh, &csp);
+
+    // Measure luminance as N-bit PQ
+    // (the smoothstep factor fades values close to zero towards exactly 0,
+    // which is what the `y_pq == 0u` black pixel test below checks for)
+    GLSL("float luma = dot("$", color.rgb);             \n"
+         "luma *= %f;                                   \n"
+         "luma = pow(clamp(luma, 0.0, 1.0), %f);        \n"
+         "luma = (%f + %f * luma) / (1.0 + %f * luma);  \n"
+         "luma = pow(luma, %f);                         \n"
+         "luma *= smoothstep(0.0, 1e-2, luma);          \n"
+         "uint y_pq = uint(%d.0 * luma);                \n",
+         sh_luma_coeffs(sh, &csp),
+         PL_COLOR_SDR_WHITE / 10000.0,
+         PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2,
+         PQ_MAX);
+
+    // Update the work group's shared atomics
+    bool has_subgroups = sh_glsl(sh).subgroup_size > 0;
+    if (use_histogram) {
+        // Bin index: top HIST_BITS bits of y_pq, minus the bias, clamped so
+        // that everything below the bias ends up in bin 0
+        GLSL("int bin = (int(y_pq) >> %d) - %d; \n"
+             "bin = clamp(bin, 0, %d);          \n",
+             PQ_BITS - HIST_BITS, HIST_BIAS,
+             HIST_BINS - 1);
+        if (has_subgroups) {
+            // Optimize for the very common case of identical histogram bins
+            GLSL("if (subgroupAllEqual(bin)) {                  \n"
+                 "    if (subgroupElect())                      \n"
+                 "        atomicAdd("$"[bin], gl_SubgroupSize); \n"
+                 "} else {                                      \n"
+                 "    atomicAdd("$"[bin], 1u);                  \n"
+                 "}                                             \n",
+                 wg_hist, wg_hist);
+        } else {
+            GLSL("atomicAdd("$"[bin], 1u); \n", wg_hist);
+        }
+    }
+
+    // Accumulate sum, maximum and black pixel count for the work group,
+    // reducing within the subgroup first where subgroup operations exist
+    if (has_subgroups) {
+        GLSL("uint group_sum = subgroupAdd(y_pq);           \n"
+             "uint group_max = subgroupMax(y_pq);           \n"
+             "uvec4 b = subgroupBallot(y_pq == 0u);         \n"
+             "if (subgroupElect()) {                        \n"
+             "    atomicAdd("$", group_sum);                \n"
+             "    atomicMax("$", group_max);                \n"
+             "    atomicAdd("$", subgroupBallotBitCount(b));\n"
+             "}                                             \n"
+             "barrier();                                    \n",
+             wg_sum, wg_max, wg_black);
+    } else {
+        GLSL("atomicAdd("$", y_pq);     \n"
+             "atomicMax("$", y_pq);     \n"
+             "if (y_pq == 0u)           \n"
+             "    atomicAdd("$", 1u);   \n"
+             "barrier();                \n",
+             wg_sum, wg_max, wg_black);
+    }
+
+    // Black pixels (y_pq == 0) were counted in bin 0 above; remove them again
+    // before merging the work group histogram into the slice's global one
+    if (use_histogram) {
+        GLSL("if (gl_LocalInvocationIndex == 0u)                            \n"
+             "    "$"[0] -= "$";                                            \n"
+             "for (uint i = gl_LocalInvocationIndex; i < %du; i += wg_size) \n"
+             "    atomicAdd(frame_hist[slice * %du + i], "$"[i]);           \n",
+             wg_hist, wg_black,
+             HIST_BINS,
+             HIST_BINS, wg_hist);
+    }
+
+    // Have one thread per work group update the global atomics
+    // (`num` is the number of non-black pixels, so the value added to
+    // frame_sum_pq is the work group's average over non-black pixels only)
+    GLSL("if (gl_LocalInvocationIndex == 0u) {                  \n"
+         "    uint num = wg_size - "$";                         \n"
+         "    atomicAdd(frame_wg_count[slice], 1u);             \n"
+         "    atomicAdd(frame_wg_active[slice], min(num, 1u));  \n"
+         "    if (num > 0u) {                                   \n"
+         "        atomicAdd(frame_sum_pq[slice], "$" / num);    \n"
+         "        atomicMax(frame_max_pq[slice], "$");          \n"
+         "    }                                                 \n"
+         "}                                                     \n"
+         "color = color_orig;                                   \n"
+         "}                                                     \n",
+         wg_black, wg_sum, wg_max);
+
+    return true;
+}
+
+// Fetches the most recent smoothed peak detection result from `state`,
+// reading back a pending measurement first if one is available.
+//
+// On success writes `max_pq_y` and `avg_pq_y` of `out` (no other fields are
+// touched) and returns true. Returns false if `state` is NULL, is not a color
+// map object, or holds no measurement yet.
+bool pl_get_detected_hdr_metadata(const pl_shader_obj state,
+                                  struct pl_hdr_metadata *out)
+{
+    const bool usable = state && state->type == PL_SHADER_OBJ_COLOR_MAP;
+    if (!usable)
+        return false;
+
+    struct sh_color_map_obj *map = state->priv;
+    update_peak_buf(state->gpu, map, false);
+
+    // A zero average marks "nothing measured so far"
+    if (map->peak.avg_pq == 0)
+        return false;
+
+    out->avg_pq_y = map->peak.avg_pq;
+    out->max_pq_y = map->peak.max_pq;
+    return true;
+}
+
+// Variant of pl_get_detected_hdr_metadata() that reports the detected peak
+// and average rescaled from PL_HDR_PQ to PL_HDR_NORM. The outputs are only
+// written when true is returned.
+bool pl_get_detected_peak(const pl_shader_obj state,
+                          float *out_peak, float *out_avg)
+{
+    struct pl_hdr_metadata detected;
+    if (pl_get_detected_hdr_metadata(state, &detected)) {
+        // Preserves old behavior
+        *out_peak = pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, detected.max_pq_y);
+        *out_avg  = pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, detected.avg_pq_y);
+        return true;
+    }
+
+    return false;
+}
+
+// Discards all peak detection state held by `state`: the pending buffer is
+// destroyed, and the stored parameters and smoothed values are zeroed. The
+// fallback readback buffer is kept. Does nothing if `state` is NULL or is not
+// a color map object.
+void pl_reset_detected_peak(pl_shader_obj state)
+{
+    if (!state || state->type != PL_SHADER_OBJ_COLOR_MAP)
+        return;
+
+    struct sh_color_map_obj *map = state->priv;
+    pl_buf_destroy(state->gpu, &map->peak.buf);
+
+    // Wipe everything, then put the readback buffer handle back in place
+    pl_buf kept = map->peak.readback;
+    memset(&map->peak, 0, sizeof(map->peak));
+    map->peak.readback = kept;
+}
+
+// Emits GLSL that linearizes the color according to `csp`, converts RGB to
+// LMS, applies the PQ curve to each LMS component and then replaces `color`
+// with (I, 0, 0, 1), where I is the dot product of the first row of
+// `pl_ipt_lms2ipt` with the encoded LMS vector.
+void pl_shader_extract_features(pl_shader sh, struct pl_color_space csp)
+{
+    if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+        return;
+
+    sh_describe(sh, "feature extraction");
+    pl_shader_linearize(sh, &csp);
+
+    // RGB -> LMS for the primaries of this color space
+    const ident_t rgb2lms =
+        SH_MAT3(pl_ipt_rgb2lms(pl_raw_primaries_get(csp.primaries)));
+    // Only the first row of the LMS -> IPT matrix is needed
+    const float *to_i = pl_ipt_lms2ipt.m[0];
+
+    GLSL("// pl_shader_extract_features             \n"
+         "{                                         \n"
+         "vec3 lms = %f * "$" * color.rgb;          \n"
+         "lms = pow(max(lms, 0.0), vec3(%f));       \n"
+         "lms = (vec3(%f) + %f * lms)               \n"
+         "        / (vec3(1.0) + %f * lms);         \n"
+         "lms = pow(lms, vec3(%f));                 \n"
+         "float I = dot(vec3(%f, %f, %f), lms);     \n"
+         "color = vec4(I, 0.0, 0.0, 1.0);           \n"
+         "}                                         \n",
+         PL_COLOR_SDR_WHITE / 10000,
+         rgb2lms,
+         PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2,
+         to_i[0], to_i[1], to_i[2]);
+}
+
+// Color mapping presets, built from the PL_COLOR_MAP_DEFAULTS and
+// PL_COLOR_MAP_HQ_DEFAULTS initializer macros respectively.
+const struct pl_color_map_params pl_color_map_default_params = { PL_COLOR_MAP_DEFAULTS };
+const struct pl_color_map_params pl_color_map_high_quality_params = { PL_COLOR_MAP_HQ_DEFAULTS };
+
+// Registers a vec2 attribute ("tone_map_coords") that gives the current
+// position relative to `rc`. The attribute values are chosen so that, along
+// x, it is 0 at rc.x0 and 1 at rc.x1; the y axis is flipped, so it is 0 at
+// rc.y1 and 1 at rc.y0. An axis whose two coordinates are both zero is
+// treated as unset and defaults to the range [0, 1].
+static ident_t rect_pos(pl_shader sh, pl_rect2df rc)
+{
+    if (rc.x0 == 0 && rc.x1 == 0)
+        rc.x1 = 1.0f;
+    if (rc.y0 == 0 && rc.y1 == 0)
+        rc.y1 = 1.0f;
+
+    // Attribute values at the corners (0,0) and (1,1) of the unit square
+    pl_rect2df coords = {
+        .x0 = -rc.x0         / (rc.x1 - rc.x0),
+        .x1 = (1.0f - rc.x0) / (rc.x1 - rc.x0),
+        .y0 = -rc.y1         / (rc.y0 - rc.y1),
+        .y1 = (1.0f - rc.y1) / (rc.y0 - rc.y1),
+    };
+
+    return sh_attr_vec2(sh, "tone_map_coords", &coords);
+}
+
+// Emits GLSL that overlays a plot of the tone mapping curve onto `color.rgb`
+// inside the rectangle `rc`, blended with an opacity of 0.8 * `alpha`
+// (reduced further under the curve).
+//
+// Within the rectangle, x is the input value and y the output value, and the
+// regions are tinted by how they relate to the source range [xmin, xmax], the
+// target range [ymin, ymax], the main diagonal and the curve itself.
+//
+// Both scalings in `params` must be PL_HDR_PQ (asserted). The emitted code
+// calls a GLSL function `tone_map(float)`, which is not defined here and is
+// presumably provided by the caller before this code runs.
+static void visualize_tone_map(pl_shader sh, pl_rect2df rc, float alpha,
+                               const struct pl_tone_map_params *params)
+{
+    pl_assert(params->input_scaling  == PL_HDR_PQ);
+    pl_assert(params->output_scaling == PL_HDR_PQ);
+
+    GLSL("// Visualize tone mapping                 \n"
+         "{                                         \n"
+         "vec2 pos = "$";                           \n"
+         "if (min(pos.x, pos.y) >= 0.0 &&           \n" // visualizer rect
+         "    max(pos.x, pos.y) <= 1.0)             \n"
+         "{                                         \n"
+         "float xmin = "$";                         \n"
+         "float xmax = "$";                         \n"
+         "float xavg = "$";                         \n"
+         "float ymin = "$";                         \n"
+         "float ymax = "$";                         \n"
+         "float alpha = 0.8 * "$";                  \n"
+         "vec3 viz = color.rgb;                     \n"
+         "float vv = tone_map(pos.x);               \n"
+         // Color based on region
+         "if (pos.x < xmin || pos.x > xmax) {       \n" // outside source
+         "} else if (pos.y < ymin || pos.y > ymax) {\n" // outside target
+         "    if (pos.y < xmin || pos.y > xmax) {   \n" //  and also source
+         "        viz = vec3(0.1, 0.1, 0.5);        \n"
+         "    } else {                              \n"
+         "        viz = vec3(0.2, 0.05, 0.05);      \n" //  but inside source
+         "    }                                     \n"
+         "} else {                                  \n" // inside domain
+         "    if (abs(pos.x - pos.y) < 1e-3) {      \n" // main diagonal
+         "        viz = vec3(0.2);                  \n"
+         "    } else if (pos.y < vv) {              \n" // inside function
+         "        alpha *= 0.6;                     \n"
+         "        viz = vec3(0.05);                 \n"
+         "        if (vv > pos.x && pos.y > pos.x)  \n" // output brighter than input
+         "            viz.rg = vec2(0.5, 0.7);      \n"
+         "    } else {                              \n" // outside function
+         "        if (vv < pos.x && pos.y < pos.x)  \n" // output darker than input
+         "            viz = vec3(0.0, 0.1, 0.2);    \n"
+         "    }                                     \n"
+         "    if (pos.y > xmax) {                   \n" // inverse tone-mapping region
+         "        vec3 hi = vec3(0.2, 0.5, 0.8);    \n"
+         "        viz = mix(viz, hi, 0.5);          \n"
+         "    } else if (pos.y < xmin) {            \n" // black point region
+         "        viz = mix(viz, vec3(0.0), 0.3);   \n"
+         "    }                                     \n"
+         "    if (xavg > 0.0 && abs(pos.x - xavg) < 1e-3)\n" // source avg brightness
+         "        viz = vec3(0.5);                  \n"
+         "}                                         \n"
+         "color.rgb = mix(color.rgb, viz, alpha);   \n"
+         "}                                         \n"
+         "}                                         \n",
+         // Arguments, in the order of the "$" placeholders above:
+         // pos, xmin, xmax, xavg, ymin, ymax, alpha
+         rect_pos(sh, rc),
+         SH_FLOAT_DYN(params->input_min),
+         SH_FLOAT_DYN(params->input_max),
+         SH_FLOAT_DYN(params->input_avg),
+         SH_FLOAT(params->output_min),
+         SH_FLOAT_DYN(params->output_max),
+         SH_FLOAT_DYN(alpha));
+}
+
+static void visualize_gamut_map(pl_shader sh, pl_rect2df rc,
+                                ident_t lut, float hue, float theta,
+                                const struct pl_gamut_map_params *params)
+{
+    ident_t ipt2lms = SH_MAT3(pl_ipt_ipt2lms);
+    ident_t lms2rgb_src = SH_MAT3(pl_ipt_lms2rgb(&params->input_gamut));
+    ident_t lms2rgb_dst = SH_MAT3(pl_ipt_lms2rgb(&params->output_gamut));
+    // Inside `rc`, overwrite `ipt` with a slice of the gamut LUT; reuses caller-declared ipt/lms/lmspq/idx
+    GLSL("// Visualize gamut mapping                            \n"
+         "vec2 pos = "$";                                       \n" // from rect_pos(sh, rc)
+         "float pqmin = "$";                                    \n" // params->min_luma
+         "float pqmax = "$";                                    \n" // params->max_luma
+         "float rgbmin = "$";                                   \n" // min_luma rescaled PQ -> NORM
+         "float rgbmax = "$";                                   \n" // max_luma rescaled PQ -> NORM
+         "vec3 orig = ipt;                                      \n" // the pixel's own value, restored if outside both gamuts
+         "if (min(pos.x, pos.y) >= 0.0 &&                       \n"
+         "    max(pos.x, pos.y) <= 1.0)                         \n"
+         "{                                                     \n"
+         // Source color to visualize
+         "float mid = mix(pqmin, pqmax, 0.6);                   \n"
+         "vec3 base = vec3(0.5, 0.0, 0.0);                      \n"
+         "float hue = "$", theta = "$";                         \n"
+         "base.x = mix(base.x, mid, sin(theta));                \n"
+         "mat3 rot1 = mat3(1.0,    0.0,      0.0,               \n" // rotation by `hue` in the y/z (chroma) plane
+         "                 0.0,  cos(hue), sin(hue),            \n"
+         "                 0.0, -sin(hue), cos(hue));           \n"
+         "mat3 rot2 = mat3( cos(theta), 0.0, sin(theta),        \n" // rotation by `theta` in the x/z plane
+         "                     0.0,     1.0,    0.0,            \n"
+         "                 -sin(theta), 0.0, cos(theta));       \n"
+         "vec3 dir = vec3(pos.yx - vec2(0.5), 0.0);             \n" // rect position, centered
+         "ipt = base + rot1 * rot2 * dir;                       \n"
+         // Convert back to RGB (for gamut boundary testing)
+         "lmspq = "$" * ipt;                                    \n"
+         "lms = pow(max(lmspq, 0.0), vec3(1.0/%f));             \n" // %f = PQ_M2
+         "lms = max(lms - vec3(%f), 0.0)                        \n" // %f = PQ_C1
+         "             / (vec3(%f) - %f * lms);                 \n" // %f = PQ_C2, PQ_C3
+         "lms = pow(lms, vec3(1.0/%f));                         \n" // %f = PQ_M1
+         "lms *= %f;                                            \n" // %f = 10000 / PL_COLOR_SDR_WHITE
+         // Check against src/dst gamut boundaries
+         "vec3 rgbsrc = "$" * lms;                              \n"
+         "vec3 rgbdst = "$" * lms;                              \n"
+         "bool insrc, indst;                                    \n"
+         "insrc = all(lessThan(rgbsrc, vec3(rgbmax))) &&        \n"
+         "              all(greaterThan(rgbsrc, vec3(rgbmin))); \n"
+         "indst = all(lessThan(rgbdst, vec3(rgbmax))) &&        \n"
+         "              all(greaterThan(rgbdst, vec3(rgbmin))); \n"
+         // Sample from gamut mapping 3DLUT
+         "idx.x = (ipt.x - pqmin) / (pqmax - pqmin);            \n"
+         "idx.y = 2.0 * length(ipt.yz);                         \n"
+         "idx.z = %f * atan(ipt.z, ipt.y) + 0.5;                \n" // %f = 0.5 / M_PI
+         "vec3 mapped = "$"(idx).xyz;                           \n"
+         "mapped.yz -= vec2(32768.0/65535.0);                   \n" // remove the unsigned-storage offset
+         "float mappedhue = atan(mapped.z, mapped.y);           \n"
+         "float mappedchroma = length(mapped.yz);               \n"
+         "ipt = mapped;                                         \n"
+         // Visualize gamuts
+         "if (!insrc && !indst) {                               \n"
+         "    ipt = orig;                                       \n"
+         "} else if (insrc && !indst) {                         \n"
+         "    ipt.x -= 0.1;                                     \n"
+         "} else if (indst && !insrc) {                         \n"
+         "    ipt.x += 0.1;                                     \n"
+         "}                                                     \n"
+         // Visualize iso-luminance and iso-hue lines
+         "vec3 line;                                            \n"
+         "if (insrc && fract(50.0 * mapped.x) < 1e-1) {         \n"
+         "    float k = smoothstep(0.1, 0.0, abs(sin(theta)));  \n"
+         "    line.x = mix(mapped.x, 0.3, 0.5);                 \n"
+         "    line.yz = sqrt(length(mapped.yz)) *               \n"
+         "              normalize(mapped.yz);                   \n"
+         "    ipt = mix(ipt, line, k);                          \n"
+         "}                                                     \n"
+         "if (insrc && fract(10.0 * (mappedhue - hue)) < 1e-1) {\n"
+         "    float k = smoothstep(0.3, 0.0, abs(cos(theta)));  \n"
+         "    line.x = mapped.x - 0.05;                         \n"
+         "    line.yz = 1.2 * mapped.yz;                        \n"
+         "    ipt = mix(ipt, line, k);                          \n"
+         "}                                                     \n"
+         "if (insrc && fract(100.0 * mappedchroma) < 1e-1) {    \n" // iso-chroma rings
+         "    line.x = mapped.x + 0.1;                          \n"
+         "    line.yz = 0.4 * mapped.yz;                        \n"
+         "    ipt = mix(ipt, line, 0.5);                        \n"
+         "}                                                     \n"
+         "}                                                     \n",
+         rect_pos(sh, rc),
+         SH_FLOAT(params->min_luma), SH_FLOAT(params->max_luma),
+         SH_FLOAT(pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, params->min_luma)),
+         SH_FLOAT(pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, params->max_luma)),
+         SH_FLOAT_DYN(hue), SH_FLOAT_DYN(theta),
+         ipt2lms,
+         PQ_M2, PQ_C1, PQ_C2, PQ_C3, PQ_M1, // consumed by the %f placeholders, in this order
+         10000 / PL_COLOR_SDR_WHITE,
+         lms2rgb_src,
+         lms2rgb_dst,
+         0.5f / M_PI,
+         lut);
+}
+
+static void fill_tone_lut(void *data, const struct sh_lut_params *params)
+{
+    // LUT fill callback: `priv` carries the pl_tone_map_params given to sh_lut
+    pl_tone_map_generate(data, params->priv);
+}
+
+static void fill_gamut_lut(void *data, const struct sh_lut_params *params)
+{
+    // LUT fill callback: generate the float gamut LUT, then pack it as 4x16-bit texels
+    const struct pl_gamut_map_params *lut_params = params->priv;
+    const int lut_size = params->width * params->height * params->depth;
+    void *tmp = pl_alloc(NULL, lut_size * sizeof(float) * lut_params->lut_stride);
+    pl_gamut_map_generate(tmp, lut_params);
+
+    // Convert to 16-bit unsigned integer for GPU texture
+    const float *in = tmp;
+    uint16_t *out = data;
+    pl_assert(lut_params->lut_stride == 3);
+    pl_assert(params->comps == 4);
+    for (int i = 0; i < lut_size; i++, in += 3, out += 4) {
+        out[0] = roundf(in[0] * UINT16_MAX);
+        out[1] = roundf(in[1] * UINT16_MAX + (UINT16_MAX >> 1)); // offset: signed value in unsigned storage
+        out[2] = roundf(in[2] * UINT16_MAX + (UINT16_MAX >> 1)); // (the shader subtracts 32768/65535 again)
+        out[3] = 0; // padding component, never sampled; zeroed so no indeterminate bytes are uploaded/cached
+    }
+
+    pl_free(tmp);
+}
+
+void pl_shader_color_map_ex(pl_shader sh, const struct pl_color_map_params *params,
+                            const struct pl_color_map_args *args)
+{
+    if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+        return;
+    // Emits GLSL that maps `color` from args->src to args->dst (tone + gamut mapping done in IPT)
+    struct pl_color_space src = args->src, dst = args->dst; // local copies, refined by inference below
+    pl_color_space_infer_map(&src, &dst);
+    if (pl_color_space_equal(&src, &dst)) { // nothing to map
+        if (args->prelinearized)
+            pl_shader_delinearize(sh, &dst); // input arrived linear; re-apply dst's encoding
+        return;
+    }
+
+    struct sh_color_map_obj *obj = NULL; // stays NULL when the caller provides no state
+    if (args->state) {
+        pl_get_detected_hdr_metadata(*args->state, &src.hdr); // presumably overrides src.hdr with detected values
+        obj = SH_OBJ(sh, args->state, PL_SHADER_OBJ_COLOR_MAP, struct sh_color_map_obj,
+                     sh_color_map_uninit);
+        if (!obj)
+            return;
+    }
+
+    params = PL_DEF(params, &pl_color_map_default_params); // fall back to the library defaults
+    GLSL("// pl_shader_color_map \n"
+         "{                      \n"); // scope is closed right after the `done:` label
+
+    struct pl_tone_map_params tone = {
+        .function       = PL_DEF(params->tone_mapping_function, &pl_tone_map_clip),
+        .constants      = params->tone_constants,
+        .param          = params->tone_mapping_param,
+        .input_scaling  = PL_HDR_PQ, // all tone-mapping quantities below are PQ-scaled
+        .output_scaling = PL_HDR_PQ,
+        .lut_size       = PL_DEF(params->lut_size, pl_color_map_default_params.lut_size),
+        .hdr            = src.hdr,
+    };
+
+    pl_color_space_nominal_luma_ex(pl_nominal_luma_params( // source range, per the requested metadata kind
+        .color      = &src,
+        .metadata   = params->metadata,
+        .scaling    = tone.input_scaling,
+        .out_min    = &tone.input_min,
+        .out_max    = &tone.input_max,
+        .out_avg    = &tone.input_avg,
+    ));
+
+    pl_color_space_nominal_luma_ex(pl_nominal_luma_params( // target range, HDR10 metadata only
+        .color      = &dst,
+        .metadata   = PL_HDR_METADATA_HDR10,
+        .scaling    = tone.output_scaling,
+        .out_min    = &tone.output_min,
+        .out_max    = &tone.output_max,
+    ));
+
+    pl_tone_map_params_infer(&tone);
+
+    // Round sufficiently similar values
+    if (fabs(tone.input_max - tone.output_max) < 1e-6)
+        tone.output_max = tone.input_max;
+    if (fabs(tone.input_min - tone.output_min) < 1e-6)
+        tone.output_min = tone.input_min;
+
+    if (!params->inverse_tone_mapping) {
+        // Never exceed the source unless requested, but still allow
+        // black point adaptation
+        tone.output_max = PL_MIN(tone.output_max, tone.input_max);
+    }
+
+    const int *lut3d_size_def = pl_color_map_default_params.lut3d_size;
+    struct pl_gamut_map_params gamut = {
+        .function        = PL_DEF(params->gamut_mapping, &pl_gamut_map_clip),
+        .constants       = params->gamut_constants,
+        .input_gamut     = src.hdr.prim,
+        .output_gamut    = dst.hdr.prim,
+        .lut_size_I      = PL_DEF(params->lut3d_size[0], lut3d_size_def[0]),
+        .lut_size_C      = PL_DEF(params->lut3d_size[1], lut3d_size_def[1]),
+        .lut_size_h      = PL_DEF(params->lut3d_size[2], lut3d_size_def[2]),
+        .lut_stride      = 3, // fill_gamut_lut asserts this value
+    };
+
+    float src_peak_static; // NOTE(review): written below but never read again in this function
+    pl_color_space_nominal_luma_ex(pl_nominal_luma_params(
+        .color      = &src,
+        .metadata   = PL_HDR_METADATA_HDR10,
+        .scaling    = PL_HDR_PQ,
+        .out_max    = &src_peak_static,
+    ));
+
+    pl_color_space_nominal_luma_ex(pl_nominal_luma_params( // luma bounds of the gamut LUT's I axis
+        .color      = &dst,
+        .metadata   = PL_HDR_METADATA_HDR10,
+        .scaling    = PL_HDR_PQ,
+        .out_min    = &gamut.min_luma,
+        .out_max    = &gamut.max_luma,
+    ));
+
+    // Clip the gamut mapping output to the input gamut if disabled
+    if (!params->gamut_expansion && gamut.function->bidirectional) {
+        if (pl_primaries_compatible(&gamut.input_gamut, &gamut.output_gamut)) {
+            gamut.output_gamut = pl_primaries_clip(&gamut.output_gamut,
+                                                   &gamut.input_gamut);
+        }
+    }
+
+    // Backwards compatibility with older API
+    switch (params->gamut_mode) { // note: any mode other than CLIP replaces gamut.function
+    case PL_GAMUT_CLIP:
+        switch (params->intent) {
+        case PL_INTENT_AUTO:
+        case PL_INTENT_PERCEPTUAL:
+        case PL_INTENT_RELATIVE_COLORIMETRIC:
+            break; // leave default
+        case PL_INTENT_SATURATION:
+            gamut.function = &pl_gamut_map_saturation;
+            break;
+        case PL_INTENT_ABSOLUTE_COLORIMETRIC:
+            gamut.function = &pl_gamut_map_absolute;
+            break;
+        }
+        break;
+    case PL_GAMUT_DARKEN:
+        gamut.function = &pl_gamut_map_darken;
+        break;
+    case PL_GAMUT_WARN:
+        gamut.function = &pl_gamut_map_highlight;
+        break;
+    case PL_GAMUT_DESATURATE:
+        gamut.function = &pl_gamut_map_desaturate;
+        break;
+    case PL_GAMUT_MODE_COUNT:
+        pl_unreachable();
+    }
+
+    bool can_fast = !params->force_tone_mapping_lut; // fast = closed-form GLSL/matrix instead of a LUT
+    if (!args->state) {
+        // No state object provided, forcibly disable advanced methods
+        can_fast = true;
+        if (tone.function != &pl_tone_map_clip)
+            tone.function = &pl_tone_map_linear;
+        if (gamut.function != &pl_gamut_map_clip)
+            gamut.function = &pl_gamut_map_saturation;
+    }
+
+    pl_fmt gamut_fmt = pl_find_fmt(SH_GPU(sh), PL_FMT_UNORM, 4, 16, 16, PL_FMT_CAP_LINEAR);
+    if (!gamut_fmt) { // no format for the 3D LUT texture: use the matrix-only saturation path
+        gamut.function = &pl_gamut_map_saturation;
+        can_fast = true;
+    }
+
+    bool need_tone_map = !pl_tone_map_params_noop(&tone);
+    bool need_gamut_map = !pl_gamut_map_params_noop(&gamut);
+
+    if (!args->prelinearized)
+        pl_shader_linearize(sh, &src);
+
+    pl_matrix3x3 rgb2lms = pl_ipt_rgb2lms(pl_raw_primaries_get(src.primaries));
+    pl_matrix3x3 lms2rgb = pl_ipt_lms2rgb(pl_raw_primaries_get(dst.primaries));
+    ident_t lms2ipt = SH_MAT3(pl_ipt_lms2ipt);
+    ident_t ipt2lms = SH_MAT3(pl_ipt_ipt2lms);
+
+    if (need_gamut_map && gamut.function == &pl_gamut_map_saturation && can_fast) {
+        const pl_matrix3x3 lms2src = pl_ipt_lms2rgb(&gamut.input_gamut);
+        const pl_matrix3x3 dst2lms = pl_ipt_rgb2lms(&gamut.output_gamut);
+        sh_describe(sh, "gamut map (saturation)");
+        pl_matrix3x3_mul(&lms2rgb, &dst2lms); // fold the gamut remap into the output matrix
+        pl_matrix3x3_mul(&lms2rgb, &lms2src);
+        need_gamut_map = false; // handled by the matrix, so no LUT pass is needed
+    }
+
+    // Fast path: simply convert between primaries (if needed)
+    if (!need_tone_map && !need_gamut_map) {
+        if (src.primaries != dst.primaries) {
+            sh_describe(sh, "colorspace conversion");
+            pl_matrix3x3_mul(&lms2rgb, &rgb2lms);
+            GLSL("color.rgb = "$" * color.rgb; \n", SH_MAT3(lms2rgb));
+        }
+        goto done; // skips the IPT round trip entirely
+    }
+
+    // Full path: convert input from normalized RGB to IPT
+    GLSL("vec3 lms = "$" * color.rgb;               \n"
+         "vec3 lmspq = %f * lms;                    \n"
+         "lmspq = pow(max(lmspq, 0.0), vec3(%f));   \n"
+         "lmspq = (vec3(%f) + %f * lmspq)           \n"
+         "        / (vec3(1.0) + %f * lmspq);       \n"
+         "lmspq = pow(lmspq, vec3(%f));             \n"
+         "vec3 ipt = "$" * lmspq;                   \n"
+         "float i_orig = ipt.x;                     \n", // pre-tone-mapping intensity, used for desaturation
+         SH_MAT3(rgb2lms),
+         PL_COLOR_SDR_WHITE / 10000,
+         PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2, // consumed by the %f placeholders, in this order
+         lms2ipt);
+
+    if (params->show_clipping) {
+        const float eps = 1e-6f; // tolerance so values exactly at the limits are not flagged
+        GLSL("bool clip_hi, clip_lo;                            \n"
+             "clip_hi = any(greaterThan(color.rgb, vec3("$"))); \n"
+             "clip_lo = any(lessThan(color.rgb, vec3("$")));    \n"
+             "clip_hi = clip_hi || ipt.x > "$";                 \n"
+             "clip_lo = clip_lo || ipt.x < "$";                 \n",
+             SH_FLOAT_DYN(pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, tone.input_max) + eps),
+             SH_FLOAT(pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, tone.input_min) - eps),
+             SH_FLOAT_DYN(tone.input_max + eps),
+             SH_FLOAT(tone.input_min - eps));
+    }
+
+    if (need_tone_map) { // each branch below #defines `tone_map(x)` for the code that follows
+        const struct pl_tone_map_function *fun = tone.function;
+        sh_describef(sh, "%s tone map (%.0f -> %.0f)", fun->name,
+                     pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NITS, tone.input_max),
+                     pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NITS, tone.output_max));
+
+        if (fun == &pl_tone_map_clip && can_fast) { // closed form: plain clamp to the input range
+
+            GLSL("#define tone_map(x) clamp((x), "$", "$") \n",
+                 SH_FLOAT(tone.input_min),
+                 SH_FLOAT_DYN(tone.input_max));
+
+        } else if (fun == &pl_tone_map_linear && can_fast) { // closed form: linear stretch
+
+            const float gain = tone.constants.exposure;
+            const float scale = tone.input_max - tone.input_min;
+
+            ident_t linfun = sh_fresh(sh, "linear_pq");
+            GLSLH("float "$"(float x) {                         \n"
+                 // Stretch the input range (while clipping)
+                 "    x = "$" * x + "$";                        \n"
+                 "    x = clamp(x, 0.0, 1.0);                   \n"
+                 "    x = "$" * x + "$";                        \n" // rescale [0,1] onto the output range
+                 "    return x;                                 \n"
+                 "}                                             \n",
+                 linfun,
+                 SH_FLOAT_DYN(gain / scale),
+                 SH_FLOAT_DYN(-gain / scale * tone.input_min),
+                 SH_FLOAT_DYN(tone.output_max - tone.output_min),
+                 SH_FLOAT(tone.output_min));
+
+            GLSL("#define tone_map(x) ("$"(x)) \n", linfun);
+
+        } else { // generic: 1D LUT filled by fill_tone_lut
+
+            pl_assert(obj);
+            ident_t lut = sh_lut(sh, sh_lut_params(
+                .object     = &obj->tone.lut,
+                .var_type   = PL_VAR_FLOAT,
+                .lut_type   = SH_LUT_AUTO,
+                .method     = SH_LUT_LINEAR,
+                .width      = tone.lut_size,
+                .comps      = 1,
+                .update     = !pl_tone_map_params_equal(&tone, &obj->tone.params),
+                .dynamic    = tone.input_avg > 0, // dynamic metadata
+                .fill       = fill_tone_lut,
+                .priv       = &tone,
+            ));
+            obj->tone.params = tone; // remembered for the next call's `update` comparison
+            if (!lut) {
+                SH_FAIL(sh, "Failed generating tone-mapping LUT!");
+                return; // note: the GLSL scope opened above is left unclosed on this path
+            }
+
+            const float lut_range = tone.input_max - tone.input_min;
+            GLSL("#define tone_map(x) ("$"("$" * (x) + "$")) \n", // (x - input_min) / lut_range
+                 lut, SH_FLOAT_DYN(1.0f / lut_range),
+                 SH_FLOAT_DYN(-tone.input_min / lut_range));
+
+        }
+
+        bool need_recovery = tone.input_max >= tone.output_max; // i.e. the range is not being expanded
+        if (need_recovery && params->contrast_recovery && args->feature_map) {
+            ident_t pos, pt;
+            ident_t lowres = sh_bind(sh, args->feature_map, PL_TEX_ADDRESS_CLAMP,
+                                     PL_TEX_SAMPLE_LINEAR, "feature_map",
+                                     NULL, &pos, &pt);
+
+            // Obtain HF detail map from bicubic interpolation of LF features
+            GLSL("vec2 lpos  = "$";                                 \n"
+                 "vec2 lpt   = "$";                                 \n"
+                 "vec2 lsize = vec2(textureSize("$", 0));           \n"
+                 "vec2 frac  = fract(lpos * lsize + vec2(0.5));     \n"
+                 "vec2 frac2 = frac * frac;                         \n"
+                 "vec2 inv   = vec2(1.0) - frac;                    \n"
+                 "vec2 inv2  = inv * inv;                           \n"
+                 "vec2 w0 = 1.0/6.0 * inv2 * inv;                   \n"
+                 "vec2 w1 = 2.0/3.0 - 0.5 * frac2 * (2.0 - frac);   \n"
+                 "vec2 w2 = 2.0/3.0 - 0.5 * inv2  * (2.0 - inv);    \n"
+                 "vec2 w3 = 1.0/6.0 * frac2 * frac;                 \n"
+                 "vec4 g = vec4(w0 + w1, w2 + w3);                  \n"
+                 "vec4 h = vec4(w1, w3) / g + inv.xyxy;             \n"
+                 "h.xy -= vec2(2.0);                                \n"
+                 "vec4 p = lpos.xyxy + lpt.xyxy * h;                \n"
+                 "float l00 = textureLod("$", p.xy, 0.0).r;         \n"
+                 "float l01 = textureLod("$", p.xw, 0.0).r;         \n"
+                 "float l0 = mix(l01, l00, g.y);                    \n"
+                 "float l10 = textureLod("$", p.zy, 0.0).r;         \n"
+                 "float l11 = textureLod("$", p.zw, 0.0).r;         \n"
+                 "float l1 = mix(l11, l10, g.y);                    \n"
+                 "float luma = mix(l1, l0, g.x);                    \n"
+                 // Mix low-resolution tone mapped image with high-resolution
+                 // tone mapped image according to desired strength.
+                 "float highres = clamp(ipt.x, 0.0, 1.0);           \n"
+                 "float lowres = clamp(luma, 0.0, 1.0);             \n"
+                 "float detail = highres - lowres;                  \n"
+                 "float base = tone_map(highres);                   \n"
+                 "float sharp = tone_map(lowres) + detail;          \n"
+                 "ipt.x = clamp(mix(base, sharp, "$"), "$", "$");   \n",
+                 pos, pt, lowres,
+                 lowres, lowres, lowres, lowres, // the four textureLod taps
+                 SH_FLOAT(params->contrast_recovery),
+                 SH_FLOAT(tone.output_min), SH_FLOAT_DYN(tone.output_max));
+
+        } else {
+
+            GLSL("ipt.x = tone_map(ipt.x); \n");
+        }
+
+        // Avoid raising saturation excessively when raising brightness, and
+        // also desaturate when reducing brightness greatly to account for the
+        // reduction in gamut volume.
+        GLSL("vec2 hull = vec2(i_orig, ipt.x);                  \n"
+             "hull = ((hull - 6.0) * hull + 9.0) * hull;        \n"
+             "ipt.yz *= min(i_orig / ipt.x, hull.y / hull.x);   \n");
+    }
+
+    if (need_gamut_map) {
+        const struct pl_gamut_map_function *fun = gamut.function;
+        sh_describef(sh, "gamut map (%s)", fun->name);
+
+        pl_assert(obj); // the LUT path needs the state object to hold the LUT
+        ident_t lut = sh_lut(sh, sh_lut_params(
+            .object     = &obj->gamut.lut,
+            .var_type   = PL_VAR_FLOAT,
+            .lut_type   = SH_LUT_TEXTURE,
+            .fmt        = gamut_fmt,
+            .method     = params->lut3d_tricubic ? SH_LUT_CUBIC : SH_LUT_LINEAR,
+            .width      = gamut.lut_size_I,
+            .height     = gamut.lut_size_C,
+            .depth      = gamut.lut_size_h,
+            .comps      = 4, // fill_gamut_lut asserts this value
+            .signature  = gamut_map_signature(&gamut),
+            .cache      = SH_CACHE(sh),
+            .fill       = fill_gamut_lut,
+            .priv       = &gamut,
+        ));
+        if (!lut) {
+            SH_FAIL(sh, "Failed generating gamut-mapping LUT!");
+            return; // note: leaves the GLSL scope open and `tone_map` defined on this path
+        }
+
+        // 3D LUT lookup (in ICh space)
+        const float lut_range = gamut.max_luma - gamut.min_luma;
+        GLSL("vec3 idx;                             \n"
+             "idx.x = "$" * ipt.x + "$";            \n" // (I - min_luma) / lut_range
+             "idx.y = 2.0 * length(ipt.yz);         \n"
+             "idx.z = %f * atan(ipt.z, ipt.y) + 0.5;\n"
+             "ipt = "$"(idx).xyz;                   \n"
+             "ipt.yz -= vec2(32768.0/65535.0);      \n", // undo the offset applied by fill_gamut_lut
+             SH_FLOAT(1.0f / lut_range),
+             SH_FLOAT(-gamut.min_luma / lut_range),
+             0.5f / M_PI, lut);
+
+        if (params->show_clipping) {
+            GLSL("clip_lo = clip_lo || any(lessThan(idx, vec3(0.0)));    \n"
+                 "clip_hi = clip_hi || any(greaterThan(idx, vec3(1.0))); \n");
+        }
+
+        if (params->visualize_lut) {
+            visualize_gamut_map(sh, params->visualize_rect, lut,
+                                params->visualize_hue, params->visualize_theta,
+                                &gamut);
+        }
+    }
+
+    // Convert IPT back to linear RGB
+    GLSL("lmspq = "$" * ipt;                        \n"
+         "lms = pow(max(lmspq, 0.0), vec3(1.0/%f)); \n"
+         "lms = max(lms - vec3(%f), 0.0)            \n"
+         "             / (vec3(%f) - %f * lms);     \n"
+         "lms = pow(lms, vec3(1.0/%f));             \n"
+         "lms *= %f;                                \n"
+         "color.rgb = "$" * lms;                    \n",
+         ipt2lms,
+         PQ_M2, PQ_C1, PQ_C2, PQ_C3, PQ_M1, // consumed by the %f placeholders, in this order
+         10000 / PL_COLOR_SDR_WHITE,
+         SH_MAT3(lms2rgb));
+
+    if (params->show_clipping) { // recolor pixels flagged by clip_hi / clip_lo above
+        GLSL("if (clip_hi) {                                                \n"
+             "    float k = dot(color.rgb, vec3(2.0 / 3.0));                \n"
+             "    color.rgb = clamp(vec3(k) - color.rgb, 0.0, 1.0);         \n"
+             "    float cmin = min(min(color.r, color.g), color.b);         \n"
+             "    float cmax = max(max(color.r, color.g), color.b);         \n"
+             "    float delta = cmax - cmin;                                \n"
+             "    vec3 sat = smoothstep(cmin - 1e-6, cmax, color.rgb);      \n"
+             "    const vec3 red = vec3(1.0, 0.0, 0.0);                     \n"
+             "    color.rgb = mix(red, sat, smoothstep(0.0, 0.3, delta));   \n"
+             "} else if (clip_lo) {                                         \n"
+             "    vec3 hi = vec3(0.0, 0.3, 0.3);                            \n"
+             "    color.rgb = mix(color.rgb, hi, 0.5);                      \n"
+             "}                                                             \n");
+    }
+
+    if (need_tone_map) {
+        if (params->visualize_lut) { // overlay opacity: cos^5(theta) if gamut mapping is active, else 1
+            float alpha = need_gamut_map ? powf(cosf(params->visualize_theta), 5.0f) : 1.0f;
+            visualize_tone_map(sh, params->visualize_rect, alpha, &tone);
+        }
+        GLSL("#undef tone_map \n"); // macro was defined in the tone-mapping branch above
+    }
+
+done:
+    pl_shader_delinearize(sh, &dst); // re-apply dst's transfer function
+    GLSL("}\n"); // closes the scope opened after the params default was applied
+}
+
+// Legacy entry point: packs its positional arguments for `pl_shader_color_map_ex`
+void pl_shader_color_map(pl_shader sh, const struct pl_color_map_params *params,
+                         struct pl_color_space src, struct pl_color_space dst,
+                         pl_shader_obj *state, bool prelinearized)
+{
+    const struct pl_color_map_args *args = pl_color_map_args(
+        .state         = state,
+        .prelinearized = prelinearized,
+        .feature_map   = NULL, // this signature has no way to supply a feature map
+        .src = src, .dst = dst
+    );
+    pl_shader_color_map_ex(sh, params, args);
+}
+
+void pl_shader_cone_distort(pl_shader sh, struct pl_color_space csp,
+                            const struct pl_cone_params *params)
+{
+    // Multiplies `color.rgb` by the cone matrix for `params`, in linearized `csp`
+    if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+        return;
+    if (params == NULL || !params->cones)
+        return; // nothing requested
+
+    sh_describe(sh, "cone distortion");
+    GLSL("// pl_shader_cone_distort\n"
+         "{\n");
+
+    pl_color_space_infer(&csp);
+    pl_shader_linearize(sh, &csp);
+
+    pl_matrix3x3 distort = pl_get_cone_matrix(params, pl_raw_primaries_get(csp.primaries));
+    const struct pl_shader_var mat_var = {
+        .var  = pl_var_mat3("cone_mat"),
+        .data = PL_TRANSPOSE_3X3(distort.m),
+    };
+    GLSL("color.rgb = "$" * color.rgb; \n", sh_var(sh, mat_var));
+    pl_shader_delinearize(sh, &csp);
+    GLSL("}\n");
+}
diff --git a/src/shaders/custom.c b/src/shaders/custom.c
new file mode 100644
index 0000000..3f03e57
--- /dev/null
+++ b/src/shaders/custom.c
@@ -0,0 +1,89 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "shaders.h"
+
+#include <libplacebo/shaders/custom.h>
+
+bool pl_shader_custom(pl_shader sh, const struct pl_custom_shader *params)
+{
+    // Splice a caller-provided shader (strings plus the resources they name)
+    // into `sh`. Fails if `sh_try_compute` or `sh_require` rejects the request.
+    if (params->compute) {
+        const int group_w = PL_DEF(params->compute_group_size[0], 16);
+        const int group_h = PL_DEF(params->compute_group_size[1], 16);
+        const bool flexible = !(params->compute_group_size[0] &&
+                                params->compute_group_size[1]);
+        if (!sh_try_compute(sh, group_w, group_h, flexible, params->compute_shmem))
+            return false;
+    }
+
+    if (!sh_require(sh, params->input, params->output_w, params->output_h))
+        return false;
+
+    sh->output = params->output;
+
+    // Expose every resource under its caller-chosen name via a prelude #define
+    for (int n = 0; n < params->num_variables; n++) {
+        const struct pl_shader_var *var = &params->variables[n];
+        GLSLP("#define %s "$"\n", var->var.name, sh_var(sh, *var));
+    }
+
+    for (int n = 0; n < params->num_descriptors; n++) {
+        const struct pl_shader_desc *desc = &params->descriptors[n];
+        GLSLP("#define %s "$"\n", desc->desc.name, sh_desc(sh, *desc));
+    }
+
+    for (int n = 0; n < params->num_vertex_attribs; n++) {
+        const struct pl_shader_va *attr = &params->vertex_attribs[n];
+        GLSLP("#define %s "$"\n", attr->attr.name, sh_attr(sh, *attr));
+    }
+
+    for (int n = 0; n < params->num_constants; n++) {
+        const struct pl_shader_const *cnst = &params->constants[n];
+        GLSLP("#define %s "$"\n", cnst->name, sh_const(sh, *cnst));
+    }
+
+    if (params->prelude)
+        GLSLP("// pl_shader_custom prelude: \n%s\n", params->prelude);
+    if (params->header)
+        GLSLH("// pl_shader_custom header: \n%s\n", params->header);
+    if (params->description)
+        sh_describef(sh, "%s", params->description);
+    if (!params->body)
+        return true; // no main-body code to emit
+
+    // When the output signature differs from the input and is a color,
+    // declare a fresh `color` ahead of the body's braces
+    const char *decl = "";
+    if (params->output != params->input) {
+        if (params->output == PL_SHADER_SIG_COLOR) {
+            decl = "vec4 color = vec4(0.0);";
+        } else if (params->output == PL_SHADER_SIG_SAMPLER) {
+            pl_unreachable();
+        }
+    }
+
+    GLSL("// pl_shader_custom \n"
+         "%s                  \n"
+         "{                   \n"
+         "%s                  \n"
+         "}                   \n",
+         decl, params->body);
+
+    return true;
+}
diff --git a/src/shaders/custom_mpv.c b/src/shaders/custom_mpv.c
new file mode 100644
index 0000000..4ef0817
--- /dev/null
+++ b/src/shaders/custom_mpv.c
@@ -0,0 +1,1768 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <math.h>
+#include <limits.h>
+
+#include "gpu.h"
+#include "shaders.h"
+
+#include <libplacebo/shaders/colorspace.h>
+#include <libplacebo/shaders/custom.h>
+
+// Hard-coded size limits, mainly for convenience (to avoid dynamic memory)
+#define SHADER_MAX_HOOKS 16 // max `//!HOOK` lines accepted per pass
+#define SHADER_MAX_BINDS 16 // max `//!BIND` lines accepted per pass
+#define MAX_SHEXP_SIZE 32   // max tokens in a single RPN expression
+
+enum shexp_op {
+    SHEXP_OP_ADD, // '+'
+    SHEXP_OP_SUB, // '-'
+    SHEXP_OP_MUL, // '*'
+    SHEXP_OP_DIV, // '/'
+    SHEXP_OP_MOD, // '%'
+    SHEXP_OP_NOT, // '!' (the only operator parsed as monadic)
+    SHEXP_OP_GT,  // '>'
+    SHEXP_OP_LT,  // '<'
+    SHEXP_OP_EQ,  // '='
+};
+
+enum shexp_tag {
+    SHEXP_END = 0, // End of an RPN expression (also the tag of a zeroed entry)
+    SHEXP_CONST, // Push a constant value (`val.cval`) onto the stack
+    SHEXP_TEX_W, // Get the width of the texture (variable) named `val.varname`
+    SHEXP_TEX_H, // Get the height of the texture (variable) named `val.varname`
+    SHEXP_OP2, // Pop two elements and push the result of a dyadic operation
+    SHEXP_OP1, // Pop one element and push the result of a monadic operation
+    SHEXP_VAR, // Arbitrary variable (e.g. shader parameters), see `val.varname`
+};
+
+struct shexp {
+    enum shexp_tag tag;   // selects which member of `val` (if any) is in use
+    union {
+        float cval;       // used with SHEXP_CONST
+        pl_str varname;   // used with SHEXP_TEX_W, SHEXP_TEX_H and SHEXP_VAR
+        enum shexp_op op; // used with SHEXP_OP1 and SHEXP_OP2
+    } val;
+};
+
+struct custom_shader_hook {
+    // Variable/literal names of textures
+    pl_str pass_desc;                  // text of the `//!DESC` line
+    pl_str hook_tex[SHADER_MAX_HOOKS]; // one entry per `//!HOOK` line, in order
+    pl_str bind_tex[SHADER_MAX_BINDS]; // one entry per `//!BIND` line, in order
+    pl_str save_tex;                   // `//!SAVE` target; empty = overwrite HOOKED
+
+    // Shader body itself + metadata
+    pl_str pass_body;   // text between the headers and the next "//!" marker
+    float offset[2];    // the two values of `//!OFFSET <x> <y>`
+    bool offset_align;  // set by `//!OFFSET ALIGN`
+    int comps;          // value of `//!COMPONENTS`
+
+    // Special expressions governing the output size and execution conditions
+    struct shexp width[MAX_SHEXP_SIZE];  // `//!WIDTH`, defaults to HOOKED.w
+    struct shexp height[MAX_SHEXP_SIZE]; // `//!HEIGHT`, defaults to HOOKED.h
+    struct shexp cond[MAX_SHEXP_SIZE];   // `//!WHEN`, defaults to the constant 1
+
+    // Special metadata for compute shaders
+    bool is_compute;            // set once a `//!COMPUTE` line parsed correctly
+    int block_w, block_h;       // Block size (each block corresponds to one WG)
+    int threads_w, threads_h;   // How many threads form a WG
+};
+
+static bool parse_rpn_shexpr(pl_str line, struct shexp out[MAX_SHEXP_SIZE])
+{
+    // Tokenize `line` on spaces; every token fills the next slot of `out`
+    int count = 0;
+    while (line.len > 0) {
+        pl_str tok = pl_str_split_char(line, ' ', &line);
+        if (!tok.len)
+            continue; // empty token, e.g. between consecutive spaces
+        if (count >= MAX_SHEXP_SIZE)
+            return false; // more tokens than `out` can hold
+        struct shexp *slot = &out[count++];
+
+        // Texture size queries: NAME.w / NAME.width / NAME.h / NAME.height
+        bool is_w = pl_str_eatend0(&tok, ".w") || pl_str_eatend0(&tok, ".width");
+        bool is_h = !is_w && (pl_str_eatend0(&tok, ".h") || pl_str_eatend0(&tok, ".height"));
+        if (is_w || is_h) {
+            slot->tag = is_w ? SHEXP_TEX_W : SHEXP_TEX_H;
+            slot->val.varname = tok; // name with the suffix removed
+            continue;
+        }
+
+        // Operators are recognized by their first character only
+        const int first = tok.buf[0];
+        bool is_op = true;
+        switch (first) {
+        case '+': slot->val.op = SHEXP_OP_ADD; break;
+        case '-': slot->val.op = SHEXP_OP_SUB; break;
+        case '*': slot->val.op = SHEXP_OP_MUL; break;
+        case '/': slot->val.op = SHEXP_OP_DIV; break;
+        case '%': slot->val.op = SHEXP_OP_MOD; break;
+        case '!': slot->val.op = SHEXP_OP_NOT; break;
+        case '>': slot->val.op = SHEXP_OP_GT;  break;
+        case '<': slot->val.op = SHEXP_OP_LT;  break;
+        case '=': slot->val.op = SHEXP_OP_EQ;  break;
+        default:  is_op = false;               break;
+        }
+        if (is_op) {
+            slot->tag = (first == '!') ? SHEXP_OP1 : SHEXP_OP2;
+            continue;
+        }
+
+        if (first < '0' || first > '9') {
+            slot->tag = SHEXP_VAR; // anything else is a generic variable
+            slot->val.varname = tok;
+            continue;
+        }
+        slot->tag = SHEXP_CONST;
+        if (!pl_str_parse_float(tok, &slot->val.cval))
+            return false;
+    }
+    return true;
+}
+
+static inline pl_str split_magic(pl_str *body)
+{
+    // Cut off everything before the next "//!"; `body` becomes the remainder
+    const size_t sep_len = 3; // length of "//!"
+    const pl_str head = pl_str_split_str0(*body, "//!", body);
+    if (!body->len)
+        return head;
+    body->buf -= sep_len; // rewind so the remainder keeps its separator
+    body->len += sep_len;
+    return head;
+}
+
+static bool parse_hook(pl_log log, pl_str *body, struct custom_shader_hook *out)
+{
+    // Defaults: output sized like the hooked texture, condition always true
+    *out = (struct custom_shader_hook){
+        .pass_desc = pl_str0("unknown user shader"),
+        .width = {{ SHEXP_TEX_W, { .varname = pl_str0("HOOKED") }}},
+        .height = {{ SHEXP_TEX_H, { .varname = pl_str0("HOOKED") }}},
+        .cond = {{ SHEXP_CONST, { .cval = 1.0 }}},
+    };
+
+    int num_hooks = 0, num_binds = 0;
+
+    // Consume header lines for as long as they begin with the magic prefix
+    for (;;) {
+        pl_str next;
+        pl_str cmd = pl_str_strip(pl_str_getline(*body, &next));
+        if (!pl_str_eatstart0(&cmd, "//!"))
+            break; // not a header line: leave it in `body`
+
+        *body = next;
+
+        if (pl_str_eatstart0(&cmd, "HOOK")) {
+            if (num_hooks == SHADER_MAX_HOOKS) {
+                pl_err(log, "Passes may only hook up to %d textures!",
+                       SHADER_MAX_HOOKS);
+                return false;
+            }
+            out->hook_tex[num_hooks] = pl_str_strip(cmd);
+            num_hooks++;
+            continue;
+        }
+
+        if (pl_str_eatstart0(&cmd, "BIND")) {
+            if (num_binds == SHADER_MAX_BINDS) {
+                pl_err(log, "Passes may only bind up to %d textures!",
+                       SHADER_MAX_BINDS);
+                return false;
+            }
+            out->bind_tex[num_binds] = pl_str_strip(cmd);
+            num_binds++;
+            continue;
+        }
+
+        if (pl_str_eatstart0(&cmd, "SAVE")) {
+            pl_str target = pl_str_strip(cmd);
+            if (pl_str_equals0(target, "HOOKED")) {
+                // Special name meaning "overwrite the existing texture",
+                // which is signalled by leaving `save_tex` empty
+                target = (pl_str) {0};
+            } else if (pl_str_equals0(target, "MAIN")) {
+                target = pl_str0("MAINPRESUB"); // compatibility alias
+            }
+            out->save_tex = target;
+            continue;
+        }
+
+        if (pl_str_eatstart0(&cmd, "DESC")) {
+            out->pass_desc = pl_str_strip(cmd);
+            continue;
+        }
+
+        if (pl_str_eatstart0(&cmd, "OFFSET")) {
+            cmd = pl_str_strip(cmd);
+            if (pl_str_equals0(cmd, "ALIGN")) {
+                out->offset_align = true;
+                continue;
+            }
+
+            // Otherwise expect exactly two space-separated floats
+            pl_str x = pl_str_split_char(cmd, ' ', &cmd);
+            pl_str y = pl_str_split_char(cmd, ' ', &cmd);
+            if (!pl_str_parse_float(x, &out->offset[0]) ||
+                !pl_str_parse_float(y, &out->offset[1]) || cmd.len)
+            {
+                pl_err(log, "Error while parsing OFFSET!");
+                return false;
+            }
+            continue;
+        }
+
+        if (pl_str_eatstart0(&cmd, "WIDTH")) {
+            if (parse_rpn_shexpr(cmd, out->width))
+                continue;
+            pl_err(log, "Error while parsing WIDTH!");
+            return false;
+        }
+
+        if (pl_str_eatstart0(&cmd, "HEIGHT")) {
+            if (parse_rpn_shexpr(cmd, out->height))
+                continue;
+            pl_err(log, "Error while parsing HEIGHT!");
+            return false;
+        }
+
+        if (pl_str_eatstart0(&cmd, "WHEN")) {
+            if (parse_rpn_shexpr(cmd, out->cond))
+                continue;
+            pl_err(log, "Error while parsing WHEN!");
+            return false;
+        }
+
+        if (pl_str_eatstart0(&cmd, "COMPONENTS")) {
+            if (pl_str_parse_int(pl_str_strip(cmd), &out->comps))
+                continue;
+            pl_err(log, "Error parsing COMPONENTS: '%.*s'", PL_STR_FMT(cmd));
+            return false;
+        }
+
+        if (pl_str_eatstart0(&cmd, "COMPUTE")) {
+            // Syntax: <block_w> <block_h> [<threads_w> <threads_h>]
+            cmd = pl_str_strip(cmd);
+            pl_str bw = pl_str_split_char(cmd, ' ', &cmd);
+            pl_str bh = pl_str_split_char(cmd, ' ', &cmd);
+            bool ok = pl_str_parse_int(bw, &out->block_w) &&
+                      pl_str_parse_int(bh, &out->block_h);
+
+            cmd = pl_str_strip(cmd);
+            if (ok && cmd.len) {
+                pl_str tw = pl_str_split_char(cmd, ' ', &cmd);
+                pl_str th = pl_str_split_char(cmd, ' ', &cmd);
+                ok = pl_str_parse_int(tw, &out->threads_w) &&
+                     pl_str_parse_int(th, &out->threads_h) && !cmd.len;
+            } else {
+                // Workgroup size defaults to the block size
+                out->threads_w = out->block_w;
+                out->threads_h = out->block_h;
+            }
+
+            if (!ok) {
+                pl_err(log, "Error while parsing COMPUTE!");
+                return false;
+            }
+
+            out->is_compute = true;
+            continue;
+        }
+
+        // Unknown command type
+        pl_err(log, "Unrecognized command '%.*s'!", PL_STR_FMT(cmd));
+        return false;
+    }
+
+    // Whatever follows the headers, up to the next magic line beginning
+    // (if any), is taken as the shader body
+    out->pass_body = split_magic(body);
+
+    // A pass without any HOOK line is still accepted, but warned about
+    if (!num_hooks)
+        pl_warn(log, "Pass has no hooked textures (will be ignored)!");
+
+    return true;
+}
+
+static bool parse_tex(pl_gpu gpu, void *alloc, pl_str *body,
+                      struct pl_shader_desc *out)
+{
+    // Parses one `//!TEXTURE` section: header lines configure the texture,
+    // the text up to the next "//!" is its hex-encoded initial data
+    *out = (struct pl_shader_desc) {
+        .desc = {
+            .name = "USER_TEX",
+            .type = PL_DESC_SAMPLED_TEX,
+        },
+    };
+
+    struct pl_tex_params params = {
+        .w = 1, .h = 1, .d = 0,
+        .sampleable = true,
+        .debug_tag = PL_DEBUG_TAG,
+    };
+
+    while (true) {
+        pl_str rest;
+        pl_str line = pl_str_strip(pl_str_getline(*body, &rest));
+
+        if (!pl_str_eatstart0(&line, "//!"))
+            break;
+
+        *body = rest;
+
+        if (pl_str_eatstart0(&line, "TEXTURE")) {
+            out->desc.name = pl_strdup0(alloc, pl_str_strip(line));
+            continue;
+        }
+
+        if (pl_str_eatstart0(&line, "SIZE")) {
+            line = pl_str_strip(line);
+            int dims = 0;
+            int dim[4]; // extra space to catch invalid extra entries
+            while (line.len && dims < PL_ARRAY_SIZE(dim)) {
+                if (!pl_str_parse_int(pl_str_split_char(line, ' ', &line), &dim[dims++])) {
+                    PL_ERR(gpu, "Error while parsing SIZE!");
+                    return false;
+                }
+            }
+
+            uint32_t lim = dims == 1 ? gpu->limits.max_tex_1d_dim
+                         : dims == 2 ? gpu->limits.max_tex_2d_dim
+                         : dims == 3 ? gpu->limits.max_tex_3d_dim
+                         : 0;
+
+            // Sanity check against GPU size limits
+            switch (dims) {
+            case 3:
+                params.d = dim[2];
+                if (params.d < 1 || params.d > lim) {
+                    PL_ERR(gpu, "SIZE %d exceeds GPU's texture size limits (%d)!",
+                           params.d, lim);
+                    return false;
+                }
+                // fall through
+            case 2:
+                params.h = dim[1];
+                if (params.h < 1 || params.h > lim) {
+                    PL_ERR(gpu, "SIZE %d exceeds GPU's texture size limits (%d)!",
+                           params.h, lim);
+                    return false;
+                }
+                // fall through
+            case 1:
+                params.w = dim[0];
+                if (params.w < 1 || params.w > lim) {
+                    PL_ERR(gpu, "SIZE %d exceeds GPU's texture size limits (%d)!",
+                           params.w, lim);
+                    return false;
+                }
+                break;
+            default:
+                PL_ERR(gpu, "Invalid number of texture dimensions!");
+                return false;
+            }
+
+            // Clear out the superfluous components
+            if (dims < 3)
+                params.d = 0;
+            if (dims < 2)
+                params.h = 0;
+            continue;
+        }
+
+        if (pl_str_eatstart0(&line, "FORMAT")) {
+            line = pl_str_strip(line);
+            params.format = NULL;
+            for (int n = 0; n < gpu->num_formats; n++) {
+                pl_fmt fmt = gpu->formats[n];
+                if (pl_str_equals0(line, fmt->name)) {
+                    params.format = fmt;
+                    break;
+                }
+            }
+
+            if (!params.format || params.format->opaque) {
+                PL_ERR(gpu, "Unrecognized/unavailable FORMAT name: '%.*s'!",
+                       PL_STR_FMT(line));
+                return false;
+            }
+
+            if (!(params.format->caps & PL_FMT_CAP_SAMPLEABLE)) {
+                PL_ERR(gpu, "Chosen FORMAT '%.*s' is not sampleable!",
+                       PL_STR_FMT(line));
+                return false;
+            }
+            continue;
+        }
+
+        if (pl_str_eatstart0(&line, "FILTER")) {
+            line = pl_str_strip(line);
+            if (pl_str_equals0(line, "LINEAR")) {
+                out->binding.sample_mode = PL_TEX_SAMPLE_LINEAR;
+            } else if (pl_str_equals0(line, "NEAREST")) {
+                out->binding.sample_mode = PL_TEX_SAMPLE_NEAREST;
+            } else {
+                PL_ERR(gpu, "Unrecognized FILTER: '%.*s'!", PL_STR_FMT(line));
+                return false;
+            }
+            continue;
+        }
+
+        if (pl_str_eatstart0(&line, "BORDER")) {
+            line = pl_str_strip(line);
+            if (pl_str_equals0(line, "CLAMP")) {
+                out->binding.address_mode = PL_TEX_ADDRESS_CLAMP;
+            } else if (pl_str_equals0(line, "REPEAT")) {
+                out->binding.address_mode = PL_TEX_ADDRESS_REPEAT;
+            } else if (pl_str_equals0(line, "MIRROR")) {
+                out->binding.address_mode = PL_TEX_ADDRESS_MIRROR;
+            } else {
+                PL_ERR(gpu, "Unrecognized BORDER: '%.*s'!", PL_STR_FMT(line));
+                return false;
+            }
+            continue;
+        }
+
+        if (pl_str_eatstart0(&line, "STORAGE")) {
+            params.storable = true;
+            out->desc.type = PL_DESC_STORAGE_IMG;
+            out->desc.access = PL_DESC_ACCESS_READWRITE;
+            out->memory = PL_MEMORY_COHERENT;
+            continue;
+        }
+
+        PL_ERR(gpu, "Unrecognized command '%.*s'!", PL_STR_FMT(line));
+        return false;
+    }
+
+    if (!params.format) {
+        PL_ERR(gpu, "No FORMAT specified!");
+        return false;
+    }
+
+    int caps = params.format->caps;
+    if (out->binding.sample_mode == PL_TEX_SAMPLE_LINEAR && !(caps & PL_FMT_CAP_LINEAR)) {
+        PL_ERR(gpu, "The specified texture format cannot be linear filtered!");
+        return false;
+    }
+
+    // Decode the rest of the section (up to the next //! marker) as raw hex
+    // data for the texture
+    pl_str tex, hexdata = split_magic(body);
+    if (!pl_str_decode_hex(NULL, pl_str_strip(hexdata), &tex)) {
+        PL_ERR(gpu, "Error while parsing TEXTURE body: must be a valid "
+                    "hexadecimal sequence!");
+        return false;
+    }
+
+    // size_t math: the product of up to three GPU-limited dims can exceed INT_MAX
+    size_t texels = (size_t) params.w * PL_DEF(params.h, 1) * PL_DEF(params.d, 1);
+    size_t expected_len = texels * params.format->texel_size;
+    if (tex.len == 0 && params.storable) {
+        // In this case, it's okay that the texture has no initial data
+        pl_free_ptr(&tex.buf);
+    } else if (tex.len != expected_len) {
+        PL_ERR(gpu, "Shader TEXTURE size mismatch: got %zu bytes, expected %zu!",
+               tex.len, expected_len);
+        pl_free(tex.buf);
+        return false;
+    }
+
+    params.initial_data = tex.buf;
+    out->binding.object = pl_tex_create(gpu, &params);
+    pl_free(tex.buf);
+    if (!out->binding.object) {
+        PL_ERR(gpu, "Failed creating custom texture!");
+        return false;
+    }
+    return true;
+}
+
+static bool parse_buf(pl_gpu gpu, void *alloc, pl_str *body,
+                      struct pl_shader_desc *out)
+{
+    // Start out as an (empty) uniform buffer with a placeholder name
+    *out = (struct pl_shader_desc) {
+        .desc = {
+            .name = "USER_BUF",
+            .type = PL_DESC_BUF_UNIFORM,
+        },
+    };
+
+    // Variables are collected first and only placed once all headers have
+    // been seen, because e.g. the buffer type may still change until then
+    void *scratch = pl_tmp(alloc); // will be freed automatically on failure
+    PL_ARRAY(struct pl_var) members = {0};
+
+    for (;;) {
+        pl_str next;
+        pl_str cmd = pl_str_strip(pl_str_getline(*body, &next));
+        if (!pl_str_eatstart0(&cmd, "//!"))
+            break;
+
+        *body = next;
+
+        if (pl_str_eatstart0(&cmd, "BUFFER")) {
+            out->desc.name = pl_strdup0(alloc, pl_str_strip(cmd));
+            continue;
+        }
+
+        if (pl_str_eatstart0(&cmd, "STORAGE")) {
+            out->desc.type = PL_DESC_BUF_STORAGE;
+            out->desc.access = PL_DESC_ACCESS_READWRITE;
+            out->memory = PL_MEMORY_COHERENT;
+            continue;
+        }
+
+        if (!pl_str_eatstart0(&cmd, "VAR")) {
+            PL_ERR(gpu, "Unrecognized command '%.*s'!", PL_STR_FMT(cmd));
+            return false;
+        }
+
+        // VAR <glsl type> <name>[<array size>]
+        pl_str type_name = pl_str_split_char(pl_str_strip(cmd), ' ', &cmd);
+        struct pl_var var = {0};
+        const struct pl_named_var *nv = pl_var_glsl_types;
+        while (nv->glsl_name && !pl_str_equals0(type_name, nv->glsl_name))
+            nv++;
+        if (nv->glsl_name)
+            var = nv->var;
+
+        if (!var.type) {
+            // No type found
+            PL_ERR(gpu, "Unrecognized GLSL type '%.*s'!", PL_STR_FMT(type_name));
+            return false;
+        }
+
+        pl_str var_name = pl_str_split_char(cmd, '[', &cmd);
+        if (cmd.len > 0) {
+            // Whatever precedes the closing bracket is the array dimension
+            pl_str dim = pl_str_split_char(cmd, ']', NULL);
+            if (!pl_str_parse_int(dim, &var.dim_a)) {
+                PL_ERR(gpu, "Failed parsing array dimension from [%.*s!",
+                       PL_STR_FMT(cmd));
+                return false;
+            }
+            if (var.dim_a < 1) {
+                PL_ERR(gpu, "Invalid array dimension %d!", var.dim_a);
+                return false;
+            }
+        }
+
+        var.name = pl_strdup0(alloc, pl_str_strip(var_name));
+        PL_ARRAY_APPEND(scratch, members, var);
+    }
+
+    // Try placing all of the buffer variables
+    for (int n = 0; n < members.num; n++) {
+        if (sh_buf_desc_append(alloc, gpu, out, NULL, members.elem[n]))
+            continue;
+        PL_ERR(gpu, "Custom buffer exceeds GPU limitations!");
+        return false;
+    }
+
+    // The rest of the section (up to the next //! marker) is decoded as raw
+    // hex data for the buffer
+    pl_str payload, hex = split_magic(body);
+    if (!pl_str_decode_hex(scratch, pl_str_strip(hex), &payload)) {
+        PL_ERR(gpu, "Error while parsing BUFFER body: must be a valid "
+                    "hexadecimal sequence!");
+        return false;
+    }
+
+    // Only storage buffers may omit their initial data altogether
+    const size_t buf_size = sh_buf_desc_size(out);
+    const bool is_storage = out->desc.type == PL_DESC_BUF_STORAGE;
+    const bool may_skip = payload.len == 0 && is_storage;
+    if (!may_skip && payload.len != buf_size) {
+        PL_ERR(gpu, "Shader BUFFER size mismatch: got %zu bytes, expected %zu!",
+               payload.len, buf_size);
+        return false;
+    }
+
+    out->binding.object = pl_buf_create(gpu, pl_buf_params(
+        .size = buf_size,
+        .uniform = out->desc.type == PL_DESC_BUF_UNIFORM,
+        .storable = is_storage,
+        .initial_data = payload.len ? payload.buf : NULL,
+    ));
+
+    if (!out->binding.object) {
+        PL_ERR(gpu, "Failed creating custom buffer!");
+        return false;
+    }
+
+    pl_free(scratch);
+    return true;
+}
+
+static bool parse_var(pl_log log, pl_str str, enum pl_var_type type, pl_var_data *out)
+{
+    // An empty string means "nothing to parse"; `out` keeps its current value
+    if (str.len == 0)
+        return true;
+
+    // Only the first space-delimited word is parsed as the value
+    pl_str rest = str;
+    pl_str word = pl_str_split_char(rest, ' ', &rest);
+    bool parsed = false;
+    switch (type) {
+    case PL_VAR_SINT:
+        parsed = pl_str_parse_int(word, &out->i);
+        break;
+    case PL_VAR_UINT:
+        parsed = pl_str_parse_uint(word, &out->u);
+        break;
+    case PL_VAR_FLOAT:
+        parsed = pl_str_parse_float(word, &out->f);
+        break;
+    case PL_VAR_INVALID:
+    case PL_VAR_TYPE_COUNT:
+        pl_unreachable();
+    }
+
+    // Anything but whitespace after the value counts as left-over garbage
+    if (parsed && pl_str_strip(rest).len == 0)
+        return true;
+
+    pl_err(log, "Failed parsing variable data: %.*s", PL_STR_FMT(str));
+    return false;
+}
+
+static bool check_bounds(pl_log log, enum pl_var_type type, const pl_var_data data,
+                         const pl_var_data minimum, const pl_var_data maximum)
+{
+// Compares member `m` of `data` against the same member of `minimum` and
+// `maximum`; when it lies outside that inclusive range, logs an error using
+// printf conversion `fmt` and makes check_bounds() return false
+#define REQUIRE_IN_RANGE(m, fmt) do \
+{ \
+    const bool too_low  = data.m < minimum.m; \
+    const bool too_high = data.m > maximum.m; \
+    if (too_low) { \
+        pl_err(log, "Initial value "fmt" below declared minimum "fmt"!", \
+                data.m, minimum.m); \
+        return false; \
+    } \
+    if (too_high) { \
+        pl_err(log, "Initial value "fmt" above declared maximum "fmt"!", \
+                data.m, maximum.m); \
+        return false; \
+    } \
+} while (0)
+
+    // Only the union member selected by `type` is compared
+    switch (type) {
+    case PL_VAR_SINT:  REQUIRE_IN_RANGE(i, "%d"); break;
+    case PL_VAR_UINT:  REQUIRE_IN_RANGE(u, "%u"); break;
+    case PL_VAR_FLOAT: REQUIRE_IN_RANGE(f, "%f"); break;
+    case PL_VAR_INVALID:
+    case PL_VAR_TYPE_COUNT:
+        pl_unreachable();
+    }
+
+#undef REQUIRE_IN_RANGE
+    return true;
+}
+
+static bool parse_param(pl_log log, void *alloc, pl_str *body,
+                        struct pl_hook_par *out)
+{
+    *out = (struct pl_hook_par) {0};
+    // Bounds are kept as text until the parameter type is known
+    pl_str min_str = {0}, max_str = {0};
+    bool is_enum = false;
+
+    for (;;) {
+        pl_str next;
+        pl_str cmd = pl_str_strip(pl_str_getline(*body, &next));
+        if (!pl_str_eatstart0(&cmd, "//!"))
+            break;
+
+        *body = next;
+
+        if (pl_str_eatstart0(&cmd, "PARAM")) {
+            out->name = pl_strdup0(alloc, pl_str_strip(cmd));
+            continue;
+        }
+
+        if (pl_str_eatstart0(&cmd, "DESC")) {
+            out->description = pl_strdup0(alloc, pl_str_strip(cmd));
+            continue;
+        }
+
+        if (pl_str_eatstart0(&cmd, "MINIMUM")) {
+            min_str = pl_str_strip(cmd);
+            continue;
+        }
+
+        if (pl_str_eatstart0(&cmd, "MAXIMUM")) {
+            max_str = pl_str_strip(cmd);
+            continue;
+        }
+
+        if (!pl_str_eatstart0(&cmd, "TYPE")) {
+            pl_err(log, "Unrecognized command '%.*s'!", PL_STR_FMT(cmd));
+            return false;
+        }
+
+        // TYPE [ENUM] [DYNAMIC|CONSTANT] <glsl type>, or TYPE [ENUM] DEFINE
+        cmd = pl_str_strip(cmd);
+        is_enum = pl_str_eatstart0(&cmd, "ENUM");
+        cmd = pl_str_strip(cmd);
+        if (pl_str_eatstart0(&cmd, "DYNAMIC")) {
+            out->mode = PL_HOOK_PAR_DYNAMIC;
+        } else if (pl_str_eatstart0(&cmd, "CONSTANT")) {
+            out->mode = PL_HOOK_PAR_CONSTANT;
+        } else if (pl_str_eatstart0(&cmd, "DEFINE")) {
+            // Defines are always integers and take no GLSL type name
+            out->mode = PL_HOOK_PAR_DEFINE;
+            out->type = PL_VAR_SINT;
+            if (pl_str_strip(cmd).len == 0)
+                continue;
+            pl_err(log, "TYPE DEFINE does not take any extra arguments, "
+                   "unexpected: '%.*s'", PL_STR_FMT(cmd));
+            return false;
+        } else {
+            out->mode = PL_HOOK_PAR_VARIABLE;
+        }
+
+        // Look up the remaining text in the table of known GLSL types
+        cmd = pl_str_strip(cmd);
+        const struct pl_named_var *nv = pl_var_glsl_types;
+        while (nv->glsl_name && !pl_str_equals0(cmd, nv->glsl_name))
+            nv++;
+
+        if (!nv->glsl_name) {
+            pl_err(log, "Unrecognized GLSL type '%.*s'!", PL_STR_FMT(cmd));
+            return false;
+        }
+
+        if (nv->var.dim_v > 1 || nv->var.dim_m > 1) {
+            pl_err(log, "GLSL type '%s' is incompatible with "
+                   "shader parameters, must be scalar type!",
+                   nv->glsl_name);
+            return false;
+        }
+
+        out->type = nv->var.type;
+        if (is_enum && out->type != PL_VAR_SINT) {
+            pl_err(log, "ENUM is only compatible with type int/DEFINE!");
+            return false;
+        }
+    }
+
+    // Default bounds cover the whole range of the chosen type
+    switch (out->type) {
+    case PL_VAR_INVALID:
+        pl_err(log, "Missing variable type!");
+        return false;
+    case PL_VAR_SINT:
+        out->minimum.i = INT_MIN;
+        out->maximum.i = INT_MAX;
+        break;
+    case PL_VAR_UINT:
+        out->minimum.u = 0;
+        out->maximum.u = UINT_MAX;
+        break;
+    case PL_VAR_FLOAT:
+        out->minimum.f = -INFINITY;
+        out->maximum.f = INFINITY;
+        break;
+    case PL_VAR_TYPE_COUNT:
+        pl_unreachable();
+    }
+
+    pl_str value = pl_str_strip(split_magic(body));
+    if (value.len == 0) {
+        pl_err(log, "Missing initial parameter value!");
+        return false;
+    }
+
+    if (!is_enum) {
+        // Plain scalar: parse value and (optional) bounds, then validate
+        if (!parse_var(log, value, out->type, &out->initial) ||
+            !parse_var(log, min_str, out->type, &out->minimum) ||
+            !parse_var(log, max_str, out->type, &out->maximum) ||
+            !check_bounds(log, out->type, out->initial, out->minimum, out->maximum))
+        {
+            return false;
+        }
+    } else {
+        // Enum: every non-empty body line names one option, numbered from 0
+        PL_ARRAY(const char *) names = {0};
+        pl_assert(out->type == PL_VAR_SINT);
+        while (value.len) {
+            pl_str name = pl_str_strip(pl_str_getline(value, &value));
+            if (name.len) {
+                PL_ARRAY_APPEND(alloc, names, pl_strdup0(alloc, name));
+            }
+        }
+
+        pl_assert(names.num >= 1);
+        out->initial.i = 0;
+        out->minimum.i = 0;
+        out->maximum.i = names.num - 1;
+        out->names = names.elem;
+    }
+
+    out->data = pl_memdup(alloc, &out->initial, sizeof(out->initial));
+    return true;
+}
+
+static enum pl_hook_stage mp_stage_to_pl(pl_str stage)
+{
+    // Table of mpv hook point names and the libplacebo stage each one maps
+    // onto. Entries are tried in order and the first matching name wins.
+    // Two names ("MAINPRESUB" and "MAIN") share a single stage, so this
+    // mapping has no unique inverse.
+    static const struct {
+        const char *name;
+        enum pl_hook_stage stage;
+    } stages[] = {
+        { "RGB",            PL_HOOK_RGB_INPUT },
+        { "LUMA",           PL_HOOK_LUMA_INPUT },
+        { "CHROMA",         PL_HOOK_CHROMA_INPUT },
+        { "ALPHA",          PL_HOOK_ALPHA_INPUT },
+        { "XYZ",            PL_HOOK_XYZ_INPUT },
+
+        { "CHROMA_SCALED",  PL_HOOK_CHROMA_SCALED },
+        { "ALPHA_SCALED",   PL_HOOK_ALPHA_SCALED },
+
+        { "NATIVE",         PL_HOOK_NATIVE },
+        { "MAINPRESUB",     PL_HOOK_RGB },
+        { "MAIN",           PL_HOOK_RGB }, // Note: conflicts with above!
+
+        { "LINEAR",         PL_HOOK_LINEAR },
+        { "SIGMOID",        PL_HOOK_SIGMOID },
+        { "PREKERNEL",      PL_HOOK_PRE_KERNEL },
+        { "POSTKERNEL",     PL_HOOK_POST_KERNEL },
+
+        { "SCALED",         PL_HOOK_SCALED },
+        { "PREOUTPUT",      PL_HOOK_PRE_OUTPUT },
+        { "OUTPUT",         PL_HOOK_OUTPUT },
+    };
+
+    // Compare `stage` against every known name in turn
+    const int num_stages = PL_ARRAY_SIZE(stages);
+    for (int i = 0; i < num_stages; i++) {
+        if (pl_str_equals0(stage, stages[i].name))
+            return stages[i].stage;
+    }
+
+    // Not a known hook point name
+    return 0;
+}
+
+static pl_str pl_stage_to_mp(enum pl_hook_stage stage)
+{
+    // Pick the mpv name first, then wrap it; unknown stages are unreachable
+    const char *name = NULL;
+    switch (stage) {
+    case PL_HOOK_RGB_INPUT:     name = "RGB";           break;
+    case PL_HOOK_LUMA_INPUT:    name = "LUMA";          break;
+    case PL_HOOK_CHROMA_INPUT:  name = "CHROMA";        break;
+    case PL_HOOK_ALPHA_INPUT:   name = "ALPHA";         break;
+    case PL_HOOK_XYZ_INPUT:     name = "XYZ";           break;
+    case PL_HOOK_CHROMA_SCALED: name = "CHROMA_SCALED"; break;
+    case PL_HOOK_ALPHA_SCALED:  name = "ALPHA_SCALED";  break;
+    case PL_HOOK_NATIVE:        name = "NATIVE";        break;
+    case PL_HOOK_RGB:           name = "MAINPRESUB";    break;
+    case PL_HOOK_LINEAR:        name = "LINEAR";        break;
+    case PL_HOOK_SIGMOID:       name = "SIGMOID";       break;
+    case PL_HOOK_PRE_KERNEL:    name = "PREKERNEL";     break;
+    case PL_HOOK_POST_KERNEL:   name = "POSTKERNEL";    break;
+    case PL_HOOK_SCALED:        name = "SCALED";        break;
+    case PL_HOOK_PRE_OUTPUT:    name = "PREOUTPUT";     break;
+    case PL_HOOK_OUTPUT:        name = "OUTPUT";        break;
+    }
+
+    if (!name)
+        pl_unreachable();
+    return pl_str0(name);
+}
+
+struct hook_pass {
+    enum pl_hook_stage exec_stages; // NOTE(review): presumably the stage(s) this pass runs at - confirm
+    struct custom_shader_hook hook; // pass description, the type parse_hook() fills in
+};
+
+struct pass_tex {
+    pl_str name; // key that lookup_tex() compares requested names against
+    pl_tex tex;  // lookup_tex() reports this texture's params.w / params.h
+
+    // Metadata
+    pl_rect2df rect;
+    struct pl_color_repr repr;
+    struct pl_color_space color;
+    int comps;
+};
+
+struct hook_priv {
+    pl_log log;
+    pl_gpu gpu;
+    void *alloc;
+
+    PL_ARRAY(struct hook_pass) hook_passes;
+    PL_ARRAY(struct pl_hook_par) hook_params; // searched by name in lookup_var()
+
+    // Fixed (for shader-local resources)
+    PL_ARRAY(struct pl_shader_desc) descriptors;
+
+    // Dynamic per pass
+    enum pl_hook_stage save_stages;
+    PL_ARRAY(struct pass_tex) pass_textures; // count reset to 0 by hook_reset()
+    pl_shader trc_helper;
+
+    // State for PRNG/frame count
+    int frame_count;
+    uint64_t prng_state[4];
+};
+
+static void hook_reset(void *priv)
+{
+    // Drop every saved pass texture entry (only the count is reset)
+    ((struct hook_priv *) priv)->pass_textures.num = 0;
+}
+
+// Context during execution of a hook
+struct hook_ctx {
+    struct hook_priv *priv;              // shared state: parameters and saved pass textures
+    const struct pl_hook_params *params; // supplies src_rect / dst_rect to lookup_tex()
+    struct pass_tex hooked;              // texture reported under the name "HOOKED"
+};
+
+static bool lookup_tex(struct hook_ctx *ctx, pl_str var, float size[2])
+{
+    // Resolves a texture name to its size; `size` is only written on success
+    const struct pl_hook_params *par = ctx->params;
+    struct hook_priv *priv = ctx->priv;
+
+    if (pl_str_equals0(var, "HOOKED")) {
+        pl_assert(ctx->hooked.tex);
+        size[0] = ctx->hooked.tex->params.w;
+        size[1] = ctx->hooked.tex->params.h;
+        return true;
+    }
+
+    if (pl_str_equals0(var, "NATIVE_CROPPED")) {
+        size[0] = fabs(pl_rect_w(par->src_rect));
+        size[1] = fabs(pl_rect_h(par->src_rect));
+        return true;
+    }
+
+    if (pl_str_equals0(var, "OUTPUT")) {
+        size[0] = abs(pl_rect_w(par->dst_rect));
+        size[1] = abs(pl_rect_h(par->dst_rect));
+        return true;
+    }
+
+    // Saved pass textures are matched by name; "MAIN" is an alias
+    const pl_str name = pl_str_equals0(var, "MAIN") ? pl_str0("MAINPRESUB") : var;
+    for (int i = 0; i < priv->pass_textures.num; i++) {
+        const struct pass_tex *pt = &priv->pass_textures.elem[i];
+        if (!pl_str_equals(name, pt->name))
+            continue;
+        size[0] = pt->tex->params.w;
+        size[1] = pt->tex->params.h;
+        return true;
+    }
+
+    return false;
+}
+
+static bool lookup_var(struct hook_ctx *ctx, pl_str var, float *val)
+{
+    // Resolve `var` to a hook parameter's value, or to one of its named values
+    struct hook_priv *p = ctx->priv;
+    for (int n = 0; n < p->hook_params.num; n++) {
+        const struct pl_hook_par *par = &p->hook_params.elem[n];
+        if (!pl_str_equals0(var, par->name)) {
+            // Not the parameter itself; check its value names, if it has any
+            if (!par->names)
+                continue;
+            for (int v = par->minimum.i; v <= par->maximum.i; v++) {
+                if (pl_str_equals0(var, par->names[v])) {
+                    *val = v;
+                    return true;
+                }
+            }
+            continue;
+        }
+        // Exact match on the parameter name: convert its current value to float
+        switch (par->type) {
+        case PL_VAR_SINT:  *val = par->data->i; return true;
+        case PL_VAR_UINT:  *val = par->data->u; return true;
+        case PL_VAR_FLOAT: *val = par->data->f; return true;
+        case PL_VAR_INVALID: case PL_VAR_TYPE_COUNT: break;
+        }
+        pl_unreachable();
+    }
+
+    PL_WARN(p, "Variable '%.*s' not found in RPN expression!", PL_STR_FMT(var));
+    return false;
+}
+
+// Evaluates an RPN expression. Returns whether successful; 'result' is left untouched on failure
+static bool eval_shexpr(struct hook_ctx *ctx,
+                        const struct shexp expr[MAX_SHEXP_SIZE],
+                        float *result)
+{
+    struct hook_priv *p = ctx->priv;
+    float stack[MAX_SHEXP_SIZE] = {0};
+    int idx = 0; // points to next element to push
+    // Walk the elements in order until SHEXP_END or the maximum length
+    for (int i = 0; i < MAX_SHEXP_SIZE; i++) {
+        switch (expr[i].tag) {
+        case SHEXP_END:
+            goto done;
+
+        case SHEXP_CONST:
+            // Since our SHEXPs are bound by MAX_SHEXP_SIZE, it should be
+            // impossible to overflow the stack
+            assert(idx < MAX_SHEXP_SIZE);
+            stack[idx++] = expr[i].val.cval;
+            continue;
+
+        case SHEXP_OP1:
+            if (idx < 1) {
+                PL_WARN(p, "Stack underflow in RPN expression!");
+                return false;
+            }
+
+            switch (expr[i].val.op) {
+            case SHEXP_OP_NOT: stack[idx-1] = !stack[idx-1]; break;
+            default: pl_unreachable();
+            }
+            continue;
+
+        case SHEXP_OP2:
+            if (idx < 2) {
+                PL_WARN(p, "Stack underflow in RPN expression!");
+                return false;
+            }
+
+            // Pop the operands in reverse order (EQ uses a tolerance relative to magnitude)
+            float op2 = stack[--idx];
+            float op1 = stack[--idx];
+            float res = 0.0;
+            switch (expr[i].val.op) {
+            case SHEXP_OP_ADD: res = op1 + op2; break;
+            case SHEXP_OP_SUB: res = op1 - op2; break;
+            case SHEXP_OP_MUL: res = op1 * op2; break;
+            case SHEXP_OP_DIV: res = op1 / op2; break;
+            case SHEXP_OP_MOD: res = fmodf(op1, op2); break;
+            case SHEXP_OP_GT:  res = op1 > op2; break;
+            case SHEXP_OP_LT:  res = op1 < op2; break;
+            case SHEXP_OP_EQ:  res = fabsf(op1 - op2) <= 1e-6 * fmaxf(fabsf(op1), fabsf(op2)); break;
+            case SHEXP_OP_NOT: pl_unreachable();
+            }
+            // Reject inf/NaN results, e.g. from division or modulo by zero
+            if (!isfinite(res)) {
+                PL_WARN(p, "Illegal operation in RPN expression!");
+                return false;
+            }
+
+            stack[idx++] = res;
+            continue;
+
+        case SHEXP_TEX_W:
+        case SHEXP_TEX_H: {
+            pl_str name = expr[i].val.varname;
+            float size[2];
+
+            if (!lookup_tex(ctx, name, size)) {
+                PL_WARN(p, "Variable '%.*s' not found in RPN expression!",
+                        PL_STR_FMT(name));
+                return false;
+            }
+
+            stack[idx++] = (expr[i].tag == SHEXP_TEX_W) ? size[0] : size[1];
+            continue;
+        }
+
+        case SHEXP_VAR: {
+            pl_str name = expr[i].val.varname;
+            float val;
+            if (!lookup_var(ctx, name, &val))
+                return false;
+            stack[idx++] = val;
+            continue;
+        }
+        }
+    }
+
+done:
+    // Return the single stack element
+    if (idx != 1) {
+        PL_WARN(p, "Malformed stack after RPN expression!");
+        return false;
+    }
+
+    *result = stack[0];
+    return true;
+}
+
+static double prng_step(uint64_t s[4])
+{
+    // One xoshiro256+ step: the output is formed from the state before advancing
+    const uint64_t sum = s[0] + s[3];
+    const uint64_t shifted = s[1] << 17;
+    s[2] ^= s[0];
+    s[3] ^= s[1];
+    s[1] ^= s[2];
+    s[0] ^= s[3];
+    s[2] ^= shifted;
+    s[3] = (s[3] >> 19) | (s[3] << 45);
+    // Keep the top 53 bits and scale by 2^-53, giving a value in [0, 1)
+    return (double) (sum >> 11) * 0x1.0p-53;
+}
+
+static bool bind_pass_tex(pl_shader sh, pl_str name,
+                          const struct pass_tex *ptex,
+                          const pl_rect2df *rect,
+                          bool hooked, bool mainpresub)
+{
+    ident_t id, pos, pt;
+    // Binds ptex->tex to `sh` and emits the NAME_* helper macros for it
+    // Compatibility with mpv texture binding semantics
+    id = sh_bind(sh, ptex->tex, PL_TEX_ADDRESS_CLAMP, PL_TEX_SAMPLE_LINEAR,
+                 "hook_tex", rect, &pos, &pt);
+    if (!id)
+        return false;
+    // Basic per-texture macros, built from the identifiers sh_bind() returned
+    GLSLH("#define %.*s_raw "$" \n", PL_STR_FMT(name), id);
+    GLSLH("#define %.*s_pos "$" \n", PL_STR_FMT(name), pos);
+    GLSLH("#define %.*s_map "$"_map \n", PL_STR_FMT(name), pos);
+    GLSLH("#define %.*s_size vec2(textureSize("$", 0)) \n", PL_STR_FMT(name), id);
+    GLSLH("#define %.*s_pt "$" \n", PL_STR_FMT(name), pt);
+    // NAME_off: the (x0, y0) origin of the texture's stored rect
+    float off[2] = { ptex->rect.x0, ptex->rect.y0 };
+    GLSLH("#define %.*s_off "$" \n", PL_STR_FMT(name),
+          sh_var(sh, (struct pl_shader_var) {
+              .var = pl_var_vec2("offset"),
+              .data = off,
+    }));
+    // NAME_mul: value returned by pl_color_repr_normalize() on a copy of the repr
+    struct pl_color_repr repr = ptex->repr;
+    ident_t scale = SH_FLOAT(pl_color_repr_normalize(&repr));
+    GLSLH("#define %.*s_mul "$" \n", PL_STR_FMT(name), scale);
+
+    // Compatibility with mpv: the rotation is always the identity matrix
+    GLSLH("#define %.*s_rot mat2(1.0, 0.0, 0.0, 1.0) \n", PL_STR_FMT(name));
+
+    // Sampling function boilerplate
+    GLSLH("#define %.*s_tex(pos) ("$" * vec4(textureLod("$", pos, 0.0))) \n",
+          PL_STR_FMT(name), scale, id);
+    GLSLH("#define %.*s_texOff(off) (%.*s_tex("$" + "$" * vec2(off))) \n",
+          PL_STR_FMT(name), PL_STR_FMT(name), pos, pt);
+    // NAME_gather is only defined when the texture format is gatherable
+    bool can_gather = ptex->tex->params.format->gatherable;
+    if (can_gather) {
+        GLSLH("#define %.*s_gather(pos, c) ("$" * vec4(textureGather("$", pos, c))) \n",
+              PL_STR_FMT(name), scale, id);
+    }
+    // Alias every NAME_* macro as HOOKED_* when the caller asks for it
+    if (hooked) {
+        GLSLH("#define HOOKED_raw %.*s_raw \n", PL_STR_FMT(name));
+        GLSLH("#define HOOKED_pos %.*s_pos \n", PL_STR_FMT(name));
+        GLSLH("#define HOOKED_size %.*s_size \n", PL_STR_FMT(name));
+        GLSLH("#define HOOKED_rot %.*s_rot \n", PL_STR_FMT(name));
+        GLSLH("#define HOOKED_off %.*s_off \n", PL_STR_FMT(name));
+        GLSLH("#define HOOKED_pt %.*s_pt \n", PL_STR_FMT(name));
+        GLSLH("#define HOOKED_map %.*s_map \n", PL_STR_FMT(name));
+        GLSLH("#define HOOKED_mul %.*s_mul \n", PL_STR_FMT(name));
+        GLSLH("#define HOOKED_tex %.*s_tex \n", PL_STR_FMT(name));
+        GLSLH("#define HOOKED_texOff %.*s_texOff \n", PL_STR_FMT(name));
+        if (can_gather)
+            GLSLH("#define HOOKED_gather %.*s_gather \n", PL_STR_FMT(name));
+    }
+    // Likewise alias MAINPRESUB_* as MAIN_* (these names are hard-coded here)
+    if (mainpresub) {
+        GLSLH("#define MAIN_raw MAINPRESUB_raw \n");
+        GLSLH("#define MAIN_pos MAINPRESUB_pos \n");
+        GLSLH("#define MAIN_size MAINPRESUB_size \n");
+        GLSLH("#define MAIN_rot MAINPRESUB_rot \n");
+        GLSLH("#define MAIN_off MAINPRESUB_off \n");
+        GLSLH("#define MAIN_pt MAINPRESUB_pt \n");
+        GLSLH("#define MAIN_map MAINPRESUB_map \n");
+        GLSLH("#define MAIN_mul MAINPRESUB_mul \n");
+        GLSLH("#define MAIN_tex MAINPRESUB_tex \n");
+        GLSLH("#define MAIN_texOff MAINPRESUB_texOff \n");
+        if (can_gather)
+            GLSLH("#define MAIN_gather MAINPRESUB_gather \n");
+    }
+
+    return true;
+}
+
+static void save_pass_tex(struct hook_priv *p, struct pass_tex ptex)
+{
+    // Overwrite the saved texture of the same name, if there is one
+    for (int n = 0; n < p->pass_textures.num; n++) {
+        struct pass_tex *slot = &p->pass_textures.elem[n];
+        if (pl_str_equals(slot->name, ptex.name)) {
+            *slot = ptex;
+            return;
+        }
+    }
+
+    // Otherwise this is a new name, so append it to the list
+    PL_ARRAY_APPEND(p->alloc, p->pass_textures, ptex);
+}
+
+static struct pl_hook_res hook_hook(void *priv, const struct pl_hook_params *params)
+{
+    struct hook_priv *p = priv;
+    pl_str stage = pl_stage_to_mp(params->stage);
+    struct pl_hook_res res = {0};
+    // Runs every pass hooked on params->stage; `res` is only filled in once a pass overwrites the stage
+    pl_shader sh = NULL;
+    struct hook_ctx ctx = {
+        .priv = p,
+        .params = params,
+        .hooked = {
+            .name  = stage,
+            .tex   = params->tex,
+            .rect  = params->rect,
+            .repr  = params->repr,
+            .color = params->color,
+            .comps = params->components,
+        },
+    };
+
+    // Save the input texture if needed
+    if (p->save_stages & params->stage) {
+        PL_TRACE(p, "Saving input texture '%.*s' for binding",
+                 PL_STR_FMT(ctx.hooked.name));
+        save_pass_tex(p, ctx.hooked);
+    }
+
+    for (int n = 0; n < p->hook_passes.num; n++) {
+        const struct hook_pass *pass = &p->hook_passes.elem[n];
+        if (!(pass->exec_stages & params->stage))
+            continue;
+
+        const struct custom_shader_hook *hook = &pass->hook;
+        PL_TRACE(p, "Executing hook pass %d on stage '%.*s': %.*s",
+                 n, PL_STR_FMT(stage), PL_STR_FMT(hook->pass_desc));
+
+        // Test for execution condition
+        float run = 0;
+        if (!eval_shexpr(&ctx, hook->cond, &run))
+            goto error;
+
+        if (!run) {
+            PL_TRACE(p, "Skipping hook due to condition");
+            continue;
+        }
+
+        // Generate a new shader object
+        sh = pl_dispatch_begin(params->dispatch);
+
+        // Bind all necessary input textures
+        for (int i = 0; i < PL_ARRAY_SIZE(hook->bind_tex); i++) {
+            pl_str texname = hook->bind_tex[i];
+            if (!texname.len)
+                break;
+
+            // Convenience alias, to allow writing shaders that are oblivious
+            // of the exact stage they hooked. This simply translates to
+            // whatever stage actually fired the hook.
+            bool hooked = false, mainpresub = false;
+            if (pl_str_equals0(texname, "HOOKED")) {
+                // Continue with binding this, under the new name
+                texname = stage;
+                hooked = true;
+            }
+
+            // Compatibility alias, because MAIN and MAINPRESUB mean the same
+            // thing to libplacebo, but user shaders are still written as
+            // though they can be different concepts.
+            if (pl_str_equals0(texname, "MAIN") ||
+                pl_str_equals0(texname, "MAINPRESUB"))
+            {
+                texname = pl_str0("MAINPRESUB");
+                mainpresub = true;
+            }
+            // First try the shader-local descriptors (named textures/buffers)
+            for (int j = 0; j < p->descriptors.num; j++) {
+                if (pl_str_equals0(texname, p->descriptors.elem[j].desc.name)) {
+                    // Directly bind this, no need to bother with all the
+                    // `bind_pass_tex` boilerplate
+                    ident_t id = sh_desc(sh, p->descriptors.elem[j]);
+                    GLSLH("#define %.*s "$" \n", PL_STR_FMT(texname), id);
+
+                    if (p->descriptors.elem[j].desc.type == PL_DESC_SAMPLED_TEX) {
+                        GLSLH("#define %.*s_tex(pos) (textureLod("$", pos, 0.0)) \n",
+                              PL_STR_FMT(texname), id);
+                    }
+                    goto next_bind;
+                }
+            }
+            // Otherwise fall back to the textures saved by earlier stages and passes
+            for (int j = 0; j < p->pass_textures.num; j++) {
+                if (pl_str_equals(texname, p->pass_textures.elem[j].name)) {
+                    // Note: We bind the whole texture, rather than
+                    // hooked.rect, because user shaders in general are not
+                    // designed to handle cropped input textures.
+                    const struct pass_tex *ptex = &p->pass_textures.elem[j];
+                    pl_rect2df rect = {
+                        0, 0, ptex->tex->params.w, ptex->tex->params.h,
+                    };
+
+                    if (hook->offset_align && pl_str_equals(texname, stage)) {
+                        float sx = pl_rect_w(ctx.hooked.rect) / pl_rect_w(params->src_rect),
+                              sy = pl_rect_h(ctx.hooked.rect) / pl_rect_h(params->src_rect),
+                              ox = ctx.hooked.rect.x0 - sx * params->src_rect.x0,
+                              oy = ctx.hooked.rect.y0 - sy * params->src_rect.y0;
+
+                        PL_TRACE(p, "Aligning plane with ref: %f %f", ox, oy);
+                        pl_rect2df_offset(&rect, ox, oy);
+                    }
+
+                    if (!bind_pass_tex(sh, texname, &p->pass_textures.elem[j],
+                                       &rect, hooked, mainpresub))
+                    {
+                        goto error;
+                    }
+                    goto next_bind;
+                }
+            }
+
+            // If none of the above matched, this is an unknown texture name,
+            // so silently ignore this pass to match the mpv behavior
+            PL_TRACE(p, "Skipping hook due to no texture named '%.*s'.",
+                     PL_STR_FMT(texname));
+            pl_dispatch_abort(params->dispatch, &sh);
+            goto next_pass;
+
+    next_bind: ; // outer 'continue'
+        }
+
+        // Set up the input variables
+        p->frame_count++;
+        GLSLH("#define frame "$" \n", sh_var(sh, (struct pl_shader_var) {
+            .var = pl_var_int("frame"),
+            .data = &p->frame_count,
+            .dynamic = true,
+        }));
+        // The PRNG is stepped once per executed pass and exposed as `random`
+        float random = prng_step(p->prng_state);
+        GLSLH("#define random "$" \n", sh_var(sh, (struct pl_shader_var) {
+            .var = pl_var_float("random"),
+            .data = &random,
+            .dynamic = true,
+        }));
+
+        float src_size[2] = { pl_rect_w(params->src_rect), pl_rect_h(params->src_rect) };
+        GLSLH("#define input_size "$" \n", sh_var(sh, (struct pl_shader_var) {
+            .var = pl_var_vec2("input_size"),
+            .data = src_size,
+        }));
+
+        float dst_size[2] = { pl_rect_w(params->dst_rect), pl_rect_h(params->dst_rect) };
+        GLSLH("#define target_size "$" \n", sh_var(sh, (struct pl_shader_var) {
+            .var = pl_var_vec2("target_size"),
+            .data = dst_size,
+        }));
+
+        float tex_off[2] = { params->src_rect.x0, params->src_rect.y0 };
+        GLSLH("#define tex_offset "$" \n", sh_var(sh, (struct pl_shader_var) {
+            .var = pl_var_vec2("tex_offset"),
+            .data = tex_off,
+        }));
+
+        // Custom parameters
+        for (int i = 0; i < p->hook_params.num; i++) {
+            const struct pl_hook_par *hp = &p->hook_params.elem[i];
+            switch (hp->mode) {
+            case PL_HOOK_PAR_VARIABLE:
+            case PL_HOOK_PAR_DYNAMIC:
+                GLSLH("#define %s "$" \n", hp->name,
+                      sh_var(sh, (struct pl_shader_var) {
+                        .var = {
+                            .name = hp->name,
+                            .type = hp->type,
+                            .dim_v = 1,
+                            .dim_m = 1,
+                            .dim_a = 1,
+                        },
+                        .data = hp->data,
+                        .dynamic = hp->mode == PL_HOOK_PAR_DYNAMIC,
+                }));
+                break;
+
+            case PL_HOOK_PAR_CONSTANT:
+                GLSLH("#define %s "$" \n", hp->name,
+                      sh_const(sh, (struct pl_shader_const) {
+                        .name = hp->name,
+                        .type = hp->type,
+                        .data = hp->data,
+                        .compile_time = true,
+                }));
+                break;
+
+            case PL_HOOK_PAR_DEFINE:
+                GLSLH("#define %s %d \n", hp->name, hp->data->i);
+                break;
+
+            case PL_HOOK_PAR_MODE_COUNT:
+                pl_unreachable();
+            }
+
+            if (hp->names) {
+                for (int j = hp->minimum.i; j <= hp->maximum.i; j++)
+                    GLSLH("#define %s %d \n", hp->names[j], j);
+            }
+        }
+
+        // Helper sub-shaders
+        uint64_t sh_id = SH_PARAMS(sh).id;
+        pl_shader_reset(p->trc_helper, pl_shader_params(
+            .id = ++sh_id,
+            .gpu = p->gpu,
+        ));
+        pl_shader_linearize(p->trc_helper, params->orig_color);
+        GLSLH("#define linearize "$" \n", sh_subpass(sh, p->trc_helper));
+
+        pl_shader_reset(p->trc_helper, pl_shader_params(
+            .id = ++sh_id,
+            .gpu = p->gpu,
+        ));
+        pl_shader_delinearize(p->trc_helper, params->orig_color);
+        GLSLH("#define delinearize "$" \n", sh_subpass(sh, p->trc_helper));
+
+        // Load and run the user shader itself
+        sh_append_str(sh, SH_BUF_HEADER, hook->pass_body);
+        sh_describef(sh, "%.*s", PL_STR_FMT(hook->pass_desc));
+
+        // Resolve output size and create framebuffer
+        float out_size[2] = {0};
+        if (!eval_shexpr(&ctx, hook->width,  &out_size[0]) ||
+            !eval_shexpr(&ctx, hook->height, &out_size[1]))
+        {
+            goto error;
+        }
+
+        int out_w = roundf(out_size[0]),
+            out_h = roundf(out_size[1]);
+
+        if (!sh_require(sh, PL_SHADER_SIG_NONE, out_w, out_h))
+            goto error;
+
+        // Generate a new texture to store the render result
+        pl_tex fbo;
+        fbo = params->get_tex(params->priv, out_w, out_h);
+        if (!fbo) {
+            PL_ERR(p, "Failed dispatching hook: `get_tex` callback failed?");
+            goto error;
+        }
+
+        bool ok;
+        if (hook->is_compute) {
+            // Compute passes write to `out_image` directly, so the FBO must be storable
+            if (!sh_try_compute(sh, hook->threads_w, hook->threads_h, false, 0) ||
+                !fbo->params.storable)
+            {
+                PL_ERR(p, "Failed dispatching COMPUTE shader");
+                goto error;
+            }
+
+            GLSLP("#define out_image "$" \n", sh_desc(sh, (struct pl_shader_desc) {
+                .binding.object = fbo,
+                .desc = {
+                    .name = "out_image",
+                    .type = PL_DESC_STORAGE_IMG,
+                    .access = PL_DESC_ACCESS_WRITEONLY,
+                },
+            }));
+
+            sh->output = PL_SHADER_SIG_NONE;
+
+            GLSL("hook(); \n");
+            ok = pl_dispatch_compute(params->dispatch, pl_dispatch_compute_params(
+                .shader = &sh,
+                .dispatch_size = {
+                    // Round up as many blocks as are needed to cover the image
+                    PL_DIV_UP(out_w, hook->block_w),
+                    PL_DIV_UP(out_h, hook->block_h),
+                    1,
+                },
+                .width  = out_w,
+                .height = out_h,
+            ));
+
+        } else {
+
+            // Default non-COMPUTE shaders to explicitly use fragment shaders
+            // only, to avoid breaking things like fwidth()
+            sh->type = PL_DEF(sh->type, SH_FRAGMENT);
+
+            GLSL("vec4 color = hook(); \n");
+            ok = pl_dispatch_finish(params->dispatch, pl_dispatch_params(
+                .shader = &sh,
+                .target = fbo,
+            ));
+
+        }
+
+        if (!ok)
+            goto error;
+        // Map the hooked rect into the output: scale by the size ratio, then add the pass offset
+        float sx = (float) out_w / ctx.hooked.tex->params.w,
+              sy = (float) out_h / ctx.hooked.tex->params.h,
+              x0 = sx * ctx.hooked.rect.x0 + hook->offset[0],
+              y0 = sy * ctx.hooked.rect.y0 + hook->offset[1];
+
+        pl_rect2df new_rect = {
+            x0,
+            y0,
+            x0 + sx * pl_rect_w(ctx.hooked.rect),
+            y0 + sy * pl_rect_h(ctx.hooked.rect),
+        };
+
+        if (hook->offset_align) {
+            float rx = pl_rect_w(new_rect) / pl_rect_w(params->src_rect),
+                  ry = pl_rect_h(new_rect) / pl_rect_h(params->src_rect),
+                  ox = rx * params->src_rect.x0 - sx * ctx.hooked.rect.x0,
+                  oy = ry * params->src_rect.y0 - sy * ctx.hooked.rect.y0;
+
+            pl_rect2df_offset(&new_rect, ox, oy);
+        }
+
+        // Save the result of this shader invocation
+        struct pass_tex ptex = {
+            .name  = hook->save_tex.len ? hook->save_tex : stage,
+            .tex   = fbo,
+            .repr  = ctx.hooked.repr,
+            .color = ctx.hooked.color,
+            .comps = PL_DEF(hook->comps, ctx.hooked.comps),
+            .rect  = new_rect,
+        };
+
+        // It's assumed that users will correctly normalize the input
+        pl_color_repr_normalize(&ptex.repr);
+
+        PL_TRACE(p, "Saving output texture '%.*s' from hook execution on '%.*s'",
+                 PL_STR_FMT(ptex.name), PL_STR_FMT(stage));
+
+        save_pass_tex(p, ptex);
+
+        // Update the result object, unless we saved to a different name
+        if (pl_str_equals(ptex.name, stage)) {
+            ctx.hooked = ptex;
+            res = (struct pl_hook_res) {
+                .output     = PL_HOOK_SIG_TEX,
+                .tex        = fbo,
+                .repr       = ptex.repr,
+                .color      = ptex.color,
+                .components = ptex.comps,
+                .rect       = new_rect,
+            };
+        }
+
+next_pass: ;
+    }
+
+    return res;
+
+error:
+    pl_dispatch_abort(params->dispatch, &sh);
+    return (struct pl_hook_res) { .failed = true };
+}
+
+const struct pl_hook *pl_mpv_user_shader_parse(pl_gpu gpu,
+                                               const char *shader_text,
+                                               size_t shader_len)
+{
+    if (!shader_len)
+        return NULL;
+
+    pl_str shader = { (uint8_t *) shader_text, shader_len };
+    // `hook` is also passed as the allocation context for everything parsed below
+    struct pl_hook *hook = pl_zalloc_obj(NULL, hook, struct hook_priv);
+    struct hook_priv *p = PL_PRIV(hook);
+
+    *hook = (struct pl_hook) {
+        .input = PL_HOOK_SIG_TEX,
+        .priv = p,
+        .reset = hook_reset,
+        .hook = hook_hook,
+        .signature = pl_str_hash(shader),
+    };
+
+    *p = (struct hook_priv) {
+        .log = gpu->log,
+        .gpu = gpu,
+        .alloc = hook,
+        .trc_helper = pl_shader_alloc(gpu->log, NULL),
+        .prng_state = {
+            // Determined by fair die roll
+            0xb76d71f9443c228allu, 0x93a02092fc4807e8llu,
+            0x06d81748f838bd07llu, 0x9381ee129dddce6cllu,
+        },
+    };
+    // Parse from a copy allocated under the hook instead of the caller's buffer
+    shader = pl_strdup(hook, shader);
+
+    // Skip all garbage (e.g. comments) before the first header
+    int pos = pl_str_find(shader, pl_str0("//!"));
+    if (pos < 0) {
+        PL_ERR(gpu, "Shader appears to contain no headers?");
+        goto error;
+    }
+    shader = pl_str_drop(shader, pos);
+
+    // Loop over the file
+    while (shader.len > 0)
+    {
+        // Peek at the first header to dispatch the right type
+        if (pl_str_startswith0(shader, "//!TEXTURE")) {
+            struct pl_shader_desc sd;
+            if (!parse_tex(gpu, hook, &shader, &sd))
+                goto error;
+
+            PL_INFO(gpu, "Registering named texture '%s'", sd.desc.name);
+            PL_ARRAY_APPEND(hook, p->descriptors, sd);
+            continue;
+        }
+
+        if (pl_str_startswith0(shader, "//!BUFFER")) {
+            struct pl_shader_desc sd;
+            if (!parse_buf(gpu, hook, &shader, &sd))
+                goto error;
+
+            PL_INFO(gpu, "Registering named buffer '%s'", sd.desc.name);
+            PL_ARRAY_APPEND(hook, p->descriptors, sd);
+            continue;
+        }
+
+        if (pl_str_startswith0(shader, "//!PARAM")) {
+            struct pl_hook_par hp;
+            if (!parse_param(gpu->log, hook, &shader, &hp))
+                goto error;
+
+            PL_INFO(gpu, "Registering named parameter '%s'", hp.name);
+            PL_ARRAY_APPEND(hook, p->hook_params, hp);
+            continue;
+        }
+        // Anything else is parsed as a hook pass
+        struct custom_shader_hook h;
+        if (!parse_hook(gpu->log, &shader, &h))
+            goto error;
+
+        struct hook_pass pass = {
+            .exec_stages = 0,
+            .hook = h,
+        };
+        // The pass executes on its hook_tex stages; stages it binds must have their input saved
+        for (int i = 0; i < PL_ARRAY_SIZE(h.hook_tex); i++)
+            pass.exec_stages |= mp_stage_to_pl(h.hook_tex[i]);
+        for (int i = 0; i < PL_ARRAY_SIZE(h.bind_tex); i++) {
+            p->save_stages |= mp_stage_to_pl(h.bind_tex[i]);
+            if (pl_str_equals0(h.bind_tex[i], "HOOKED"))
+                p->save_stages |= pass.exec_stages;
+        }
+
+        // As an extra precaution, this avoids errors when trying to run
+        // conditions against planes that were never hooked. As a sole
+        // exception, OUTPUT is special because it's hard-coded to return the
+        // dst_rect even before it was hooked. (This is an apparently
+        // undocumented mpv quirk, but shaders rely on it in practice)
+        enum pl_hook_stage rpn_stages = 0;
+        for (int i = 0; i < PL_ARRAY_SIZE(h.width); i++) {
+            if (h.width[i].tag == SHEXP_TEX_W || h.width[i].tag == SHEXP_TEX_H)
+                rpn_stages |= mp_stage_to_pl(h.width[i].val.varname);
+        }
+        for (int i = 0; i < PL_ARRAY_SIZE(h.height); i++) {
+            if (h.height[i].tag == SHEXP_TEX_W || h.height[i].tag == SHEXP_TEX_H)
+                rpn_stages |= mp_stage_to_pl(h.height[i].val.varname);
+        }
+        for (int i = 0; i < PL_ARRAY_SIZE(h.cond); i++) {
+            if (h.cond[i].tag == SHEXP_TEX_W || h.cond[i].tag == SHEXP_TEX_H)
+                rpn_stages |= mp_stage_to_pl(h.cond[i].val.varname);
+        }
+
+        p->save_stages |= rpn_stages & ~PL_HOOK_OUTPUT;
+
+        PL_INFO(gpu, "Registering hook pass: %.*s", PL_STR_FMT(h.pass_desc));
+        PL_ARRAY_APPEND(hook, p->hook_passes, pass);
+    }
+
+    // We need to hook on both the exec and save stages, so that we can keep
+    // track of any textures we might need
+    hook->stages |= p->save_stages;
+    for (int i = 0; i < p->hook_passes.num; i++)
+        hook->stages |= p->hook_passes.elem[i].exec_stages;
+    // Expose the parsed parameters through the public hook object
+    hook->parameters = p->hook_params.elem;
+    hook->num_parameters = p->hook_params.num;
+
+    PL_MSG(gpu, PL_LOG_DEBUG, "Loaded user shader:");
+    pl_msg_source(gpu->log, PL_LOG_DEBUG, shader_text);
+
+    return hook;
+
+error:
+    pl_mpv_user_shader_destroy((const struct pl_hook **) &hook);
+    PL_MSG(gpu, PL_LOG_ERR, "Failed to parse user shader:");
+    pl_msg_source(gpu->log, PL_LOG_ERR, shader_text);
+    pl_log_stack_trace(gpu->log, PL_LOG_ERR);
+    return NULL;
+}
+
+void pl_mpv_user_shader_destroy(const struct pl_hook **hookp)
+{
+    const struct pl_hook *hook = *hookp;
+    if (!hook)
+        return; // nothing to destroy
+    // Destroy the GPU objects behind the shader's descriptors, then the hook itself
+    struct hook_priv *p = PL_PRIV(hook);
+    for (int i = 0; i < p->descriptors.num; i++) {
+        switch (p->descriptors.elem[i].desc.type) {
+        case PL_DESC_BUF_UNIFORM:
+        case PL_DESC_BUF_STORAGE:
+        case PL_DESC_BUF_TEXEL_UNIFORM:
+        case PL_DESC_BUF_TEXEL_STORAGE: {
+            pl_buf buf = p->descriptors.elem[i].binding.object;
+            pl_buf_destroy(p->gpu, &buf);
+            break;
+        }
+
+        case PL_DESC_SAMPLED_TEX:
+        case PL_DESC_STORAGE_IMG: {
+            pl_tex tex = p->descriptors.elem[i].binding.object;
+            pl_tex_destroy(p->gpu, &tex);
+            break;
+        }
+
+        case PL_DESC_INVALID:
+        case PL_DESC_TYPE_COUNT:
+            pl_unreachable();
+        }
+    }
+
+    pl_shader_free(&p->trc_helper);
+    pl_free((void *) hook);
+    *hookp = NULL; // leave the caller's pointer cleared
+}
diff --git a/src/shaders/deinterlacing.c b/src/shaders/deinterlacing.c
new file mode 100644
index 0000000..5c85138
--- /dev/null
+++ b/src/shaders/deinterlacing.c
@@ -0,0 +1,260 @@
+/*
+ * This file is part of libplacebo, but also based on vf_yadif_cuda.cu:
+ * Copyright (C) 2018 Philip Langdale <philipl@overt.org>
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "shaders.h"
+
+#include <libplacebo/shaders/deinterlacing.h>
+
+const struct pl_deinterlace_params pl_deinterlace_default_params = { PL_DEINTERLACE_DEFAULTS };
+
+void pl_shader_deinterlace(pl_shader sh, const struct pl_deinterlace_source *src,
+                           const struct pl_deinterlace_params *params)
+{ // Appends GLSL that declares `color` and fills it with the deinterlaced sample of `src`
+    params = PL_DEF(params, &pl_deinterlace_default_params);
+
+    const struct pl_tex_params *texparams = &src->cur.top->params;
+    if (!sh_require(sh, PL_SHADER_SIG_NONE, texparams->w, texparams->h))
+        return;
+
+    sh_describe(sh, "deinterlacing");
+    GLSL("vec4 color = vec4(0,0,0,1);   \n"
+         "// pl_shader_deinterlace      \n"
+         "{                             \n");
+    // Restrict the requested components to those the texture format actually has
+    uint8_t comp_mask = PL_DEF(src->component_mask, 0xFu);
+    comp_mask &= (1u << texparams->format->num_components) - 1u;
+    if (!comp_mask) {
+        SH_FAIL(sh, "pl_shader_deinterlace: empty component mask?");
+        return;
+    }
+
+    const uint8_t num_comps = sh_num_comps(comp_mask);
+    const char *swiz = sh_swizzle(comp_mask);
+    GLSL("#define T %s \n", sh_float_type(comp_mask));
+
+    ident_t pos, pt;
+    ident_t cur = sh_bind(sh, src->cur.top, PL_TEX_ADDRESS_MIRROR,
+                          PL_TEX_SAMPLE_NEAREST, "cur", NULL, &pos, &pt);
+    if (!cur)
+        return;
+
+    GLSL("#define GET(TEX, X, Y)                              \\\n"
+         "    (textureLod(TEX, pos + pt * vec2(X, Y), 0.0).%s)  \n"
+         "vec2 pos = "$";                                       \n"
+         "vec2 pt  = "$";                                       \n"
+         "T res;                                                \n",
+         swiz, pos, pt);
+
+    if (src->field == PL_FIELD_NONE) {
+        GLSL("res = GET("$", 0, 0); \n", cur);
+        goto done;
+    }
+
+    // Don't modify the primary field: rows whose parity matches it are copied as-is
+    GLSL("int yh = textureSize("$", 0).y;   \n"
+         "int yo = int("$".y * float(yh));  \n"
+         "if (yo %% 2 == %d) {              \n"
+         "    res = GET("$", 0, 0);         \n"
+         "} else {                          \n",
+         cur, pos,
+         src->field == PL_FIELD_TOP ? 0 : 1,
+         cur);
+
+    switch (params->algo) {
+    case PL_DEINTERLACE_WEAVE:
+        GLSL("res = GET("$", 0, 0); \n", cur);
+        break;
+
+    case PL_DEINTERLACE_BOB:
+        GLSL("res = GET("$", 0, %d); \n", cur,
+             src->field == PL_FIELD_TOP ? -1 : 1);
+        break;
+
+
+    case PL_DEINTERLACE_YADIF: {
+        // Try using a compute shader for this, for the sole reason of
+        // optimizing for thread group synchronicity. Otherwise, because we
+        // alternate between lines output as-is and lines output deinterlaced,
+        // half of our thread group will be mostly idle at any point in time.
+        const int bw = PL_DEF(sh_glsl(sh).subgroup_size, 32);
+        sh_try_compute(sh, bw, 1, true, 0);
+
+        // This magic constant is hard-coded in the original implementation as
+        // '1' on an 8-bit scale. Since we work with arbitrary bit depth
+        // floating point textures, we have to convert this somehow. Hard-code
+        // it as 1/255 under the assumption that the original intent was to be
+        // roughly 1 unit of brightness increment on an 8-bit source. This may
+        // or may not produce suboptimal results on higher-bit-depth content.
+        static const float spatial_bias = 1 / 255.0f;
+
+        // Calculate spatial prediction
+        ident_t spatial_pred = sh_fresh(sh, "spatial_predictor");
+        GLSLH("float "$"(float a, float b, float c, float d, float e, float f, float g, \n"
+              "          float h, float i, float j, float k, float l, float m, float n) \n"
+              "{                                                                        \n"
+              "    float spatial_pred = (d + k) / 2.0;                                  \n"
+              "    float spatial_score = abs(c - j) + abs(d - k) + abs(e - l) - %f;     \n"
+
+              "    float score = abs(b - k) + abs(c - l) + abs(d - m);                  \n"
+              "    if (score < spatial_score) {                                         \n"
+              "        spatial_pred = (c + l) / 2.0;                                    \n"
+              "        spatial_score = score;                                           \n"
+              "        score = abs(a - l) + abs(b - m) + abs(c - n);                    \n"
+              "        if (score < spatial_score) {                                     \n"
+              "          spatial_pred = (b + m) / 2.0;                                  \n"
+              "          spatial_score = score;                                         \n"
+              "        }                                                                \n"
+              "    }                                                                    \n"
+              "    score = abs(d - i) + abs(e - j) + abs(f - k);                        \n"
+              "    if (score < spatial_score) {                                         \n"
+              "        spatial_pred = (e + j) / 2.0;                                    \n"
+              "        spatial_score = score;                                           \n"
+              "        score = abs(e - h) + abs(f - i) + abs(g - j);                    \n"
+              "        if (score < spatial_score) {                                     \n"
+              "          spatial_pred = (f + i) / 2.0;                                  \n"
+              "          spatial_score = score;                                         \n"
+              "        }                                                                \n"
+              "    }                                                                    \n"
+              "    return spatial_pred;                                                 \n"
+              "}                                                                        \n",
+              spatial_pred, spatial_bias);
+
+        GLSL("T a = GET("$", -3, -1); \n"
+             "T b = GET("$", -2, -1); \n"
+             "T c = GET("$", -1, -1); \n"
+             "T d = GET("$",  0, -1); \n"
+             "T e = GET("$", +1, -1); \n"
+             "T f = GET("$", +2, -1); \n"
+             "T g = GET("$", +3, -1); \n"
+             "T h = GET("$", -3, +1); \n"
+             "T i = GET("$", -2, +1); \n"
+             "T j = GET("$", -1, +1); \n"
+             "T k = GET("$",  0, +1); \n"
+             "T l = GET("$", +1, +1); \n"
+             "T m = GET("$", +2, +1); \n"
+             "T n = GET("$", +3, +1); \n",
+             cur, cur, cur, cur, cur, cur, cur, cur, cur, cur, cur, cur, cur, cur);
+
+        if (num_comps == 1) {
+            GLSL("res = "$"(a, b, c, d, e, f, g, h, i, j, k, l, m, n); \n", spatial_pred);
+        } else {
+            for (uint8_t i = 0; i < num_comps; i++) {
+                char c = "xyzw"[i];
+                GLSL("res.%c = "$"(a.%c, b.%c, c.%c, d.%c, e.%c, f.%c, g.%c,  \n"
+                     "             h.%c, i.%c, j.%c, k.%c, l.%c, m.%c, n.%c); \n",
+                     c, spatial_pred, c, c, c, c, c, c, c, c, c, c, c, c, c, c);
+            }
+        }
+
+        // Calculate temporal prediction
+        ident_t temporal_pred = sh_fresh(sh, "temporal_predictor");
+        GLSLH("float "$"(float A, float B, float C, float D, float E, float F,  \n"
+              "          float G, float H, float I, float J, float K, float L,  \n"
+              "          float spatial_pred)                                    \n"
+              "{                                                                \n"
+              "    float p0 = (C + H) / 2.0;                                    \n"
+              "    float p1 = F;                                                \n"
+              "    float p2 = (D + I) / 2.0;                                    \n"
+              "    float p3 = G;                                                \n"
+              "    float p4 = (E + J) / 2.0;                                    \n"
+
+              "    float tdiff0 = abs(D - I) / 2.0;                             \n"
+              "    float tdiff1 = (abs(A - F) + abs(B - G)) / 2.0;              \n"
+              "    float tdiff2 = (abs(K - F) + abs(G - L)) / 2.0;              \n"
+              "    float diff = max(tdiff0, max(tdiff1, tdiff2));               \n",
+              temporal_pred);
+        if (!params->skip_spatial_check) {
+            GLSLH("float maxi = max(p2 - min(p3, p1), min(p0 - p1, p4 - p3));   \n"
+                  "float mini = min(p2 - max(p3, p1), max(p0 - p1, p4 - p3));   \n"
+                  "diff = max(diff, max(mini, -maxi));                          \n");
+        }
+        GLSLH("    if (spatial_pred > p2 + diff)                                \n"
+              "      spatial_pred = p2 + diff;                                  \n"
+              "    if (spatial_pred < p2 - diff)                                \n"
+              "      spatial_pred = p2 - diff;                                  \n"
+              "    return spatial_pred;                                         \n"
+              "}                                                                \n");
+        // Fall back to the current frame when prev/next are missing or identical to it
+        ident_t prev2 = cur, next2 = cur;
+        if (src->prev.top && src->prev.top != src->cur.top) {
+            pl_assert(src->prev.top->params.w == texparams->w);
+            pl_assert(src->prev.top->params.h == texparams->h);
+            prev2 = sh_bind(sh, src->prev.top, PL_TEX_ADDRESS_MIRROR,
+                            PL_TEX_SAMPLE_NEAREST, "prev", NULL, NULL, NULL);
+            if (!prev2)
+                return;
+        }
+
+        if (src->next.top && src->next.top != src->cur.top) {
+            pl_assert(src->next.top->params.w == texparams->w);
+            pl_assert(src->next.top->params.h == texparams->h);
+            next2 = sh_bind(sh, src->next.top, PL_TEX_ADDRESS_MIRROR,
+                            PL_TEX_SAMPLE_NEAREST, "next", NULL, NULL, NULL);
+            if (!next2)
+                return;
+        }
+        // Choose the nearer temporal neighbours by whether `src->field` is the first field
+        enum pl_field first_field = PL_DEF(src->first_field, PL_FIELD_TOP);
+        ident_t prev1 = src->field == first_field ? prev2 : cur;
+        ident_t next1 = src->field == first_field ? cur : next2;
+
+        GLSL("T A = GET("$", 0, -1); \n"
+             "T B = GET("$", 0,  1); \n"
+             "T C = GET("$", 0, -2); \n"
+             "T D = GET("$", 0,  0); \n"
+             "T E = GET("$", 0, +2); \n"
+             "T F = GET("$", 0, -1); \n"
+             "T G = GET("$", 0, +1); \n"
+             "T H = GET("$", 0, -2); \n"
+             "T I = GET("$", 0,  0); \n"
+             "T J = GET("$", 0, +2); \n"
+             "T K = GET("$", 0, -1); \n"
+             "T L = GET("$", 0, +1); \n",
+             prev2, prev2,
+             prev1, prev1, prev1,
+             cur, cur,
+             next1, next1, next1,
+             next2, next2);
+
+        if (num_comps == 1) {
+            GLSL("res = "$"(A, B, C, D, E, F, G, H, I, J, K, L, res); \n", temporal_pred);
+        } else {
+            for (uint8_t i = 0; i < num_comps; i++) {
+                char c = "xyzw"[i];
+                GLSL("res.%c = "$"(A.%c, B.%c, C.%c, D.%c, E.%c, F.%c, \n"
+                     "             G.%c, H.%c, I.%c, J.%c, K.%c, L.%c, \n"
+                     "             res.%c);                            \n",
+                     c, temporal_pred, c, c, c, c, c, c, c, c, c, c, c, c, c);
+            }
+        }
+        break;
+    }
+
+    case PL_DEINTERLACE_ALGORITHM_COUNT:
+        pl_unreachable();
+    }
+
+    GLSL("}\n"); // End of primary/secondary field branch
+
+done:
+    GLSL("color.%s = res;   \n"
+         "#undef T          \n"
+         "#undef GET        \n"
+         "}                 \n",
+         swiz);
+}
diff --git a/src/shaders/dithering.c b/src/shaders/dithering.c
new file mode 100644
index 0000000..4485d11
--- /dev/null
+++ b/src/shaders/dithering.c
@@ -0,0 +1,527 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <math.h>
+#include "shaders.h"
+
+#include <libplacebo/shaders/dithering.h>
+
+const struct pl_dither_params pl_dither_default_params = { PL_DITHER_DEFAULTS };
+
+struct sh_dither_obj { // state behind the `dither_state` object of pl_shader_dither
+    pl_shader_obj lut; // passed to sh_lut() as the backing object of the dither matrix
+};
+
+static void sh_dither_uninit(pl_gpu gpu, void *ptr)
+{ // Uninit callback handed to SH_OBJ for `struct sh_dither_obj`; `gpu` is unused
+    struct sh_dither_obj *obj = ptr;
+    pl_shader_obj_destroy(&obj->lut);
+    *obj = (struct sh_dither_obj) {0}; // zero the state after destroying the LUT
+}
+
+static void fill_dither_matrix(void *data, const struct sh_lut_params *params)
+{ // sh_lut fill callback: generates the matrix for the method in `params->priv`
+    pl_assert(params->width > 0 && params->height > 0 && params->comps == 1);
+
+    const struct pl_dither_params *dpar = params->priv;
+    switch (dpar->method) {
+    case PL_DITHER_ORDERED_LUT:
+        pl_assert(params->width == params->height);
+        pl_generate_bayer_matrix(data, params->width);
+        return;
+
+    case PL_DITHER_BLUE_NOISE:
+        pl_assert(params->width == params->height);
+        pl_generate_blue_noise(data, params->width);
+        return;
+
+    case PL_DITHER_ORDERED_FIXED: // remaining methods: `data` is left untouched
+    case PL_DITHER_WHITE_NOISE:
+    case PL_DITHER_METHOD_COUNT:
+        return;
+    }
+
+    pl_unreachable();
+}
+
+static bool dither_method_is_lut(enum pl_dither_method method)
+{ // True for the methods that pl_shader_dither implements through sh_lut()
+    switch (method) {
+    case PL_DITHER_BLUE_NOISE:
+    case PL_DITHER_ORDERED_LUT:
+        return true;
+    case PL_DITHER_ORDERED_FIXED:
+    case PL_DITHER_WHITE_NOISE:
+        return false;
+    case PL_DITHER_METHOD_COUNT: // not a real method; falls out to pl_unreachable()
+        break;
+    }
+
+    pl_unreachable();
+}
+
+static inline float approx_gamma(enum pl_color_transfer trc)
+{ // Approximate exponent for `trc`; pl_shader_dither uses it for depths <= 4
+    switch (trc) {
+    case PL_COLOR_TRC_UNKNOWN:  return 1.0f;
+    case PL_COLOR_TRC_LINEAR:   return 1.0f;
+    case PL_COLOR_TRC_PRO_PHOTO:return 1.8f;
+    case PL_COLOR_TRC_GAMMA18:  return 1.8f;
+    case PL_COLOR_TRC_GAMMA20:  return 2.0f;
+    case PL_COLOR_TRC_GAMMA24:  return 2.4f;
+    case PL_COLOR_TRC_GAMMA26:  return 2.6f;
+    case PL_COLOR_TRC_ST428:    return 2.6f;
+    case PL_COLOR_TRC_GAMMA28:  return 2.8f;
+
+    case PL_COLOR_TRC_SRGB:
+    case PL_COLOR_TRC_BT_1886:
+    case PL_COLOR_TRC_GAMMA22:
+        return 2.2f;
+
+    case PL_COLOR_TRC_PQ:
+    case PL_COLOR_TRC_HLG:
+    case PL_COLOR_TRC_V_LOG:
+    case PL_COLOR_TRC_S_LOG1:
+    case PL_COLOR_TRC_S_LOG2:
+        return 2.0f; // TODO: handle this better
+
+    case PL_COLOR_TRC_COUNT: break; // not a real transfer; hits pl_unreachable()
+    }
+
+    pl_unreachable();
+}
+
+// Appends GLSL that dithers `color` down to `new_depth` bits using `params->method`
+void pl_shader_dither(pl_shader sh, int new_depth,
+                      pl_shader_obj *dither_state,
+                      const struct pl_dither_params *params)
+{
+    if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+        return;
+    // `1LLU << new_depth` (the scale below) is only defined for depths up to 63
+    if (new_depth <= 0 || new_depth > 63) {
+        PL_WARN(sh, "Invalid dither depth: %d.. ignoring", new_depth);
+        return;
+    }
+
+    sh_describef(sh, "dithering (%d bits)", new_depth);
+    GLSL("// pl_shader_dither \n"
+        "{                    \n"
+        "float bias;          \n");
+
+    params = PL_DEF(params, &pl_dither_default_params);
+    if (params->lut_size < 0 || params->lut_size > 8) {
+        SH_FAIL(sh, "Invalid `lut_size` specified: %d", params->lut_size);
+        return;
+    }
+
+    enum pl_dither_method method = params->method;
+    ident_t lut = NULL_IDENT;
+    int lut_size = 0;
+
+    if (dither_method_is_lut(method)) {
+        if (!dither_state) {
+            PL_WARN(sh, "LUT-based dither method specified but no dither state "
+                    "object given, falling back to non-LUT based methods.");
+            goto fallback;
+        }
+
+        struct sh_dither_obj *obj = SH_OBJ(sh, dither_state, PL_SHADER_OBJ_DITHER,
+                                           struct sh_dither_obj, sh_dither_uninit);
+        if (!obj)
+            goto fallback;
+
+        bool cache = method == PL_DITHER_BLUE_NOISE;
+        lut_size = 1 << PL_DEF(params->lut_size, pl_dither_default_params.lut_size);
+        lut = sh_lut(sh, sh_lut_params(
+            .object     = &obj->lut,
+            .var_type   = PL_VAR_FLOAT,
+            .width      = lut_size,
+            .height     = lut_size,
+            .comps      = 1,
+            .fill       = fill_dither_matrix,
+            .signature  = (CACHE_KEY_DITHER ^ method) * lut_size,
+            .cache      = cache ? SH_CACHE(sh) : NULL,
+            .priv       = (void *) params,
+        ));
+        if (!lut)
+            goto fallback;
+    }
+
+    goto done;
+
+fallback:
+    method = PL_DITHER_ORDERED_FIXED;
+    // fall through
+
+done: ;
+
+    int size = 0;
+    if (lut) {
+        size = lut_size;
+    } else if (method == PL_DITHER_ORDERED_FIXED) {
+        size = 16; // hard-coded size
+    }
+
+    if (size) {
+        // Transform the screen position to the cyclic range [0,1)
+        GLSL("vec2 pos = fract(gl_FragCoord.xy * 1.0/"$"); \n", SH_FLOAT(size));
+
+        if (params->temporal) {
+            int phase = SH_PARAMS(sh).index % 8;
+            float r = phase * (M_PI / 2); // rotate
+            float m = phase < 4 ? 1 : -1; // mirror
+            float mat[2][2] = {
+                {cos(r),     -sin(r)    },
+                {sin(r) * m,  cos(r) * m},
+            };
+
+            ident_t rot = sh_var(sh, (struct pl_shader_var) {
+                .var  = pl_var_mat2("dither_rot"),
+                .data = &mat[0][0],
+                .dynamic = true,
+            });
+            GLSL("pos = fract("$" * pos + vec2(1.0));\n", rot);
+        }
+    }
+
+    switch (method) {
+    case PL_DITHER_WHITE_NOISE: {
+        ident_t prng = sh_prng(sh, params->temporal, NULL);
+        GLSL("bias = "$".x;\n", prng);
+        break;
+    }
+
+    case PL_DITHER_ORDERED_FIXED:
+        // Bitwise ordered dither using only 32-bit uints
+        GLSL("uvec2 xy = uvec2(pos * 16.0) %% 16u;     \n"
+             // Bitwise merge (morton number)
+             "xy.x = xy.x ^ xy.y;                      \n"
+             "xy = (xy | xy << 2) & uvec2(0x33333333); \n"
+             "xy = (xy | xy << 1) & uvec2(0x55555555); \n"
+             // Bitwise inversion
+             "uint b = xy.x + (xy.y << 1);             \n"
+             "b = (b * 0x0802u & 0x22110u) |           \n"
+             "    (b * 0x8020u & 0x88440u);            \n"
+             "b = 0x10101u * b;                        \n"
+             "b = (b >> 16) & 0xFFu;                   \n"
+             // Generate bias value
+             "bias = float(b) * 1.0/256.0;             \n");
+        break;
+
+    case PL_DITHER_BLUE_NOISE:
+    case PL_DITHER_ORDERED_LUT:
+        pl_assert(lut);
+        GLSL("bias = "$"(ivec2(pos * "$"));\n", lut, SH_FLOAT(lut_size));
+        break;
+
+    case PL_DITHER_METHOD_COUNT:
+        pl_unreachable();
+    }
+
+    // Scale factor for dither rounding
+    GLSL("const float scale = %llu.0; \n", (1LLU << new_depth) - 1);
+
+    const float gamma = approx_gamma(params->transfer);
+    if (gamma != 1.0f && new_depth <= 4) {
+        GLSL("const float gamma = "$";                  \n"
+             "vec4 color_lin = pow(color, vec4(gamma)); \n",
+             SH_FLOAT(gamma));
+
+        if (new_depth == 1) {
+            // Special case for bit depth 1 dithering, in this case we can just
+            // ignore the low/high rounding because we know we are always
+            // dithering between 0.0 and 1.0.
+            GLSL("const vec4 low = vec4(0.0);           \n"
+                 "const vec4 high = vec4(1.0);          \n"
+                 "vec4 offset = color_lin;              \n");
+        } else {
+            // Linearize the low, high and current color values
+            GLSL("vec4 low = floor(color * scale) / scale;  \n"
+                 "vec4 high = ceil(color * scale) / scale;  \n"
+                 "vec4 low_lin = pow(low, vec4(gamma));     \n"
+                 "vec4 high_lin = pow(high, vec4(gamma));   \n"
+                 "vec4 range = high_lin - low_lin;          \n"
+                 "vec4 offset = (color_lin - low_lin) /     \n"
+                 "              max(range, 1e-6);           \n");
+        }
+
+        // Mix in the correct ratio corresponding to the offset and bias
+        GLSL("color = mix(low, high, greaterThan(offset, vec4(bias))); \n");
+    } else {
+        // Approximate each gamma segment as a straight line, this simplifies
+        // the process of dithering down to a single scale and (biased) round.
+        GLSL("color = scale * color + vec4(bias);   \n"
+             "color = floor(color) * (1.0 / scale); \n");
+    }
+
+    GLSL("} \n");
+}
+
+/* Error diffusion code is taken from mpv, original copyright (c) 2019 Bin Jin
+ *
+ * mpv is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+// After a (y, x) -> (y, x + y * shift) mapping, find the rightmost column that
+// will be affected by the current column.
+static int compute_rightmost_shifted_column(const struct pl_error_diffusion_kernel *k)
+{
+    int ret = 0;
+    for (int y = 0; y <= PL_EDF_MAX_DY; y++) {
+        for (int x = PL_EDF_MIN_DX; x <= PL_EDF_MAX_DX; x++) {
+            if (k->pattern[y][x - PL_EDF_MIN_DX] != 0) {
+                int shifted_x = x + y * k->shift;
+
+                // The shift mapping guarantees current column (or left of it)
+                // won't be affected by error diffusion.
+                pl_assert(shifted_x > 0);
+
+                ret = PL_MAX(ret, shifted_x);
+            }
+        }
+    }
+    return ret;
+}
+
+size_t pl_error_diffusion_shmem_req(const struct pl_error_diffusion_kernel *kernel,
+                                    int height)
+{
+    // We add PL_EDF_MAX_DY empty lines on the bottom to handle errors
+    // propagated out from the bottom side.
+    int rows = height + PL_EDF_MAX_DY;
+    int shifted_columns = compute_rightmost_shifted_column(kernel) + 1;
+
+    // The shared memory is an array of rows*shifted_columns uints, each packing
+    // the three RGB error components; multiply in size_t to avoid int overflow.
+    return (size_t) rows * shifted_columns * sizeof(uint32_t);
+}
+
+// Emits a compute shader that error-diffusion dithers `input_tex` into `output_tex`
+bool pl_shader_error_diffusion(pl_shader sh, const struct pl_error_diffusion_params *params)
+{
+    const int width = params->input_tex->params.w, height = params->input_tex->params.h;
+    const struct pl_glsl_version glsl = sh_glsl(sh);
+    const struct pl_error_diffusion_kernel *kernel =
+        PL_DEF(params->kernel, &pl_error_diffusion_sierra_lite);
+
+    pl_assert(params->output_tex->params.w == width);
+    pl_assert(params->output_tex->params.h == height);
+    if (!sh_require(sh, PL_SHADER_SIG_NONE, width, height))
+        return false;
+    // `1 << new_depth` (dither_quant below) overflows a 32-bit int past depth 30
+    if (params->new_depth <= 0 || params->new_depth > 30) {
+        PL_WARN(sh, "Invalid dither depth: %d.. ignoring", params->new_depth);
+        return false;
+    }
+
+    // The parallel error diffusion works by applying the shift mapping first.
+    // Taking the Floyd and Steinberg algorithm for example. After applying
+    // the (y, x) -> (y, x + y * shift) mapping (with shift=2), all errors are
+    // propagated into the next few columns, which makes parallel processing on
+    // the same column possible.
+    //
+    //           X    7/16                X    7/16
+    //    3/16  5/16  1/16   ==>    0     0    3/16  5/16  1/16
+
+    // Figuring out the size of rectangle containing all shifted pixels.
+    // The rectangle height is not changed.
+    int shifted_width = width + (height - 1) * kernel->shift;
+
+    // We process all pixels from the shifted rectangles column by column, with
+    // a single global work group of size |block_size| (clamped to >= 1 so the
+    // division below is safe). Figure out how many blocks are required to process
+    // all pixels; we need this explicitly so the number of barrier() calls matches.
+    int block_size = PL_MAX(PL_MIN(glsl.max_group_threads, height), 1);
+    int blocks = PL_DIV_UP(height * shifted_width, block_size);
+
+    // If we figure out how many of the next columns will be affected while the
+    // current column is being processed, we can store errors of only a few
+    // columns in the shared memory. Using a ring buffer further saves the
+    // cost of iterating to the next column.
+    int ring_buffer_rows = height + PL_EDF_MAX_DY;
+    int ring_buffer_columns = compute_rightmost_shifted_column(kernel) + 1;
+    ident_t ring_buffer_size = sh_const(sh, (struct pl_shader_const) {
+        .type = PL_VAR_UINT,
+        .name = "ring_buffer_size",
+        .data = &(unsigned) { ring_buffer_rows * ring_buffer_columns },
+        .compile_time = true,
+    });
+
+    // Compute shared memory requirements and try enabling compute shader.
+    size_t shmem_req = pl_error_diffusion_shmem_req(kernel, height);
+    if (!sh_try_compute(sh, block_size, 1, false, shmem_req)) {
+        PL_ERR(sh, "Cannot execute error diffusion kernel: too old GPU or "
+               "insufficient compute shader memory!");
+        return false;
+    }
+
+    ident_t in_tex = sh_desc(sh, (struct pl_shader_desc) {
+        .binding.object = params->input_tex,
+        .desc = {
+            .name   = "input_tex",
+            .type   = PL_DESC_SAMPLED_TEX,
+        },
+    });
+
+    ident_t out_img = sh_desc(sh, (struct pl_shader_desc) {
+        .binding.object = params->output_tex,
+        .desc = {
+            .name    = "output_tex",
+            .type    = PL_DESC_STORAGE_IMG,
+            .access  = PL_DESC_ACCESS_WRITEONLY,
+        },
+    });
+
+    sh->output = PL_SHADER_SIG_NONE;
+    sh_describef(sh, "error diffusion (%s, %d bits)",
+                 kernel->name, params->new_depth);
+
+    // Defines the ring buffer in shared memory.
+    GLSLH("shared uint err_rgb8["$"]; \n", ring_buffer_size);
+    GLSL("// pl_shader_error_diffusion                                          \n"
+         // Safeguard against accidental over-execution
+         "if (gl_WorkGroupID != uvec3(0))                                       \n"
+         "    return;                                                           \n"
+         // Initialize the ring buffer.
+         "for (uint i = gl_LocalInvocationIndex; i < "$"; i+=gl_WorkGroupSize.x)\n"
+         "    err_rgb8[i] = 0u;                                                 \n"
+
+        // Main block loop, add barrier here to have previous block all
+        // processed before starting the processing of the next.
+         "for (uint block_id = 0; block_id < "$"; block_id++) {                 \n"
+         "barrier();                                                            \n"
+        // Compute the coordinate of the pixel we are currently processing,
+        // both before and after the shift mapping.
+         "uint id = block_id * gl_WorkGroupSize.x + gl_LocalInvocationIndex;    \n"
+         "const uint height = "$";                                              \n"
+         "int y = int(id %% height), x_shifted = int(id / height);              \n"
+         "int x = x_shifted - y * %d;                                           \n"
+         // Proceed only if we are processing a valid pixel.
+         "if (x >= 0 && x < "$") {                                              \n"
+         // The index that the current pixel has in the ring buffer.
+         "uint idx = uint(x_shifted * "$" + y) %% "$";                          \n"
+         // Fetch the current pixel.
+         "vec4 pix_orig = texelFetch("$", ivec2(x, y), 0);                      \n"
+         "vec3 pix = pix_orig.rgb;                                              \n",
+         ring_buffer_size,
+         SH_UINT(blocks),
+         SH_UINT(height),
+         kernel->shift,
+         SH_INT(width),
+         SH_INT(ring_buffer_rows),
+         ring_buffer_size,
+         in_tex);
+
+    // The dithering will quantize pixel value into multiples of 1/dither_quant.
+    int dither_quant = (1 << params->new_depth) - 1;
+
+    // We encode errors in RGB components into a single 32-bit unsigned integer.
+    // The error we propagate from the current pixel is in range of
+    // [-0.5 / dither_quant, 0.5 / dither_quant]. While not quite obvious, the
+    // sum of all errors being propagated into a pixel is also in the same range.
+    // It's possible to map errors in this range into [-127, 127], and use an
+    // unsigned 8-bit integer to store it (using standard two's complement).
+    // The three 8-bit unsigned integers can then be encoded into a single
+    // 32-bit unsigned integer, with two 4-bit padding to prevent addition
+    // operation overflows affecting other components. There are at most 12
+    // addition operations on each pixel, so 4-bit padding should be enough.
+    // The overflow from R component will be discarded.
+    //
+    // The following figure shows what the encoding looks like.
+    //
+    //     +------------------------------------+
+    //     |RRRRRRRR|0000|GGGGGGGG|0000|BBBBBBBB|
+    //     +------------------------------------+
+    //
+
+    // The bitshift position for R and G component.
+    const int bitshift_r = 24, bitshift_g = 12;
+    // The multiplier we use to map [-0.5, 0.5] to [-127, 127].
+    const int uint8_mul = 127 * 2;
+
+    GLSL(// Add the error previously propagated into current pixel, and clear
+         // it in the ring buffer.
+         "uint err_u32 = err_rgb8[idx] + %uu;                                   \n"
+         "pix = pix * %d.0 + vec3(int((err_u32 >> %d) & 0xFFu) - 128,           \n"
+         "                        int((err_u32 >> %d) & 0xFFu) - 128,           \n"
+         "                        int( err_u32        & 0xFFu) - 128) / %d.0;   \n"
+         "err_rgb8[idx] = 0u;                                                   \n"
+         // Write the dithered pixel.
+         "vec3 dithered = round(pix);                                           \n"
+         "imageStore("$", ivec2(x, y), vec4(dithered / %d.0, pix_orig.a));      \n"
+         // Prepare for error propagation pass
+         "vec3 err_divided = (pix - dithered) * %d.0 / %d.0;                    \n"
+         "ivec3 tmp;                                                            \n",
+         (128u << bitshift_r) | (128u << bitshift_g) | 128u,
+         dither_quant, bitshift_r, bitshift_g, uint8_mul,
+         out_img, dither_quant,
+         uint8_mul, kernel->divisor);
+
+    // Group error propagation with same weight factor together, in order to
+    // reduce the number of times the error has to be re-encoded.
+    for (int dividend = 1; dividend <= kernel->divisor; dividend++) {
+        bool err_assigned = false;
+
+        for (int y = 0; y <= PL_EDF_MAX_DY; y++) {
+            for (int x = PL_EDF_MIN_DX; x <= PL_EDF_MAX_DX; x++) {
+                if (kernel->pattern[y][x - PL_EDF_MIN_DX] != dividend)
+                    continue;
+
+                if (!err_assigned) {
+                    err_assigned = true;
+
+                    GLSL("tmp = ivec3(round(err_divided * %d.0));   \n"
+                         "err_u32 = (uint(tmp.r & 0xFF) << %d) |    \n"
+                         "          (uint(tmp.g & 0xFF) << %d) |    \n"
+                         "           uint(tmp.b & 0xFF);            \n",
+                         dividend,
+                         bitshift_r, bitshift_g);
+                }
+
+                int shifted_x = x + y * kernel->shift;
+
+                // Unlike the right border, errors propagated out from left
+                // border will remain in the ring buffer. This will produce
+                // visible artifacts near the left border, especially for
+                // shift=3 kernels.
+                if (x < 0)
+                    GLSL("if (x >= %d) \n", -x);
+
+                // Calculate the new position in the ring buffer to propagate
+                // the error into.
+                int ring_buffer_delta = shifted_x * ring_buffer_rows + y;
+                GLSL("atomicAdd(err_rgb8[(idx + %du) %% "$"], err_u32); \n",
+                     ring_buffer_delta, ring_buffer_size);
+            }
+        }
+    }
+
+    GLSL("}} \n"); // end of main loop + valid pixel conditional
+    return true;
+}
diff --git a/src/shaders/film_grain.c b/src/shaders/film_grain.c
new file mode 100644
index 0000000..b1d25ff
--- /dev/null
+++ b/src/shaders/film_grain.c
@@ -0,0 +1,65 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "shaders.h"
+#include "shaders/film_grain.h"
+
+// Dispatches on the grain type to the codec-specific check. A type of
+// `PL_FILM_GRAIN_NONE` never needs grain; any other type is unreachable.
+bool pl_needs_film_grain(const struct pl_film_grain_params *params)
+{
+    if (params->data.type == PL_FILM_GRAIN_AV1)  return pl_needs_fg_av1(params);
+    if (params->data.type == PL_FILM_GRAIN_H274) return pl_needs_fg_h274(params);
+    if (params->data.type == PL_FILM_GRAIN_NONE) return false;
+    pl_unreachable();
+}
+
+struct sh_grain_obj {
+    pl_shader_obj av1;  // state handed to `pl_shader_fg_av1`
+    pl_shader_obj h274; // state handed to `pl_shader_fg_h274`
+};
+
+static void sh_grain_uninit(pl_gpu gpu, void *ptr)
+{
+    // `ptr` is the `struct sh_grain_obj`; destroy both per-codec states
+    pl_shader_obj_destroy(&((struct sh_grain_obj *) ptr)->av1);
+    pl_shader_obj_destroy(&((struct sh_grain_obj *) ptr)->h274);
+}
+
+// Applies film grain by delegating to the codec-specific shader function. The
+// per-codec states live inside a `struct sh_grain_obj` obtained through SH_OBJ.
+bool pl_shader_film_grain(pl_shader sh, pl_shader_obj *grain_state,
+                          const struct pl_film_grain_params *params)
+{
+    if (!pl_needs_film_grain(params)) {
+        // FIXME: Instead of erroring, sample directly
+        SH_FAIL(sh, "pl_shader_film_grain called but no film grain needs to be "
+                    "applied, test with `pl_needs_film_grain` first!");
+        return false;
+    }
+
+    struct sh_grain_obj *state = SH_OBJ(sh, grain_state,
+            PL_SHADER_OBJ_FILM_GRAIN, struct sh_grain_obj, sh_grain_uninit);
+    if (!state)
+        return false;
+    switch (params->data.type) {
+    case PL_FILM_GRAIN_AV1:  return pl_shader_fg_av1(sh, &state->av1, params);
+    case PL_FILM_GRAIN_H274: return pl_shader_fg_h274(sh, &state->h274, params);
+    case PL_FILM_GRAIN_NONE: return false;
+    default: pl_unreachable();
+    }
+}
diff --git a/src/shaders/film_grain.h b/src/shaders/film_grain.h
new file mode 100644
index 0000000..f6498c1
--- /dev/null
+++ b/src/shaders/film_grain.h
@@ -0,0 +1,75 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "common.h"
+
+#include <libplacebo/shaders/film_grain.h>
+
+bool pl_needs_fg_av1(const struct pl_film_grain_params *);  // used by pl_needs_film_grain
+bool pl_needs_fg_h274(const struct pl_film_grain_params *); // used by pl_needs_film_grain
+// Per-codec shader functions that pl_shader_film_grain dispatches to
+bool pl_shader_fg_av1(pl_shader, pl_shader_obj *, const struct pl_film_grain_params *);
+bool pl_shader_fg_h274(pl_shader, pl_shader_obj *, const struct pl_film_grain_params *);
+
+// Common helper function: resolves which channel (Y/Cb/Cr) the `i`'th plane
+// component stands for under `params->repr->sys`, or PL_CHANNEL_NONE.
+static inline enum pl_channel channel_map(int i, const struct pl_film_grain_params *params)
+{
+    // For RGB, the G/B/R components are treated as Y/Cb/Cr respectively
+    static const enum pl_channel rgb_roles[3] = {
+        [PL_CHANNEL_R] = PL_CHANNEL_CR,
+        [PL_CHANNEL_G] = PL_CHANNEL_Y,
+        [PL_CHANNEL_B] = PL_CHANNEL_CB,
+    };
+
+    static const enum pl_channel xyz_roles[3] = {
+        [0] = PL_CHANNEL_CR, // X
+        [1] = PL_CHANNEL_Y,  // Y
+        [2] = PL_CHANNEL_CB, // Z
+    };
+
+    // Components past `params->components` and mappings outside 0..2 map to NONE
+    const int src = i < params->components ? params->component_mapping[i] : -1;
+    if (src < 0 || src > 2)
+        return PL_CHANNEL_NONE;
+
+    switch (params->repr->sys) {
+    case PL_COLOR_SYSTEM_UNKNOWN:
+    case PL_COLOR_SYSTEM_RGB:
+        return rgb_roles[src];
+    case PL_COLOR_SYSTEM_XYZ:
+        return xyz_roles[src];
+
+    case PL_COLOR_SYSTEM_BT_601:
+    case PL_COLOR_SYSTEM_BT_709:
+    case PL_COLOR_SYSTEM_SMPTE_240M:
+    case PL_COLOR_SYSTEM_BT_2020_NC:
+    case PL_COLOR_SYSTEM_BT_2020_C:
+    case PL_COLOR_SYSTEM_BT_2100_PQ:
+    case PL_COLOR_SYSTEM_BT_2100_HLG:
+    case PL_COLOR_SYSTEM_DOLBYVISION:
+    case PL_COLOR_SYSTEM_YCGCO:
+        return src; // the mapped index is returned directly as the channel
+
+    case PL_COLOR_SYSTEM_COUNT:
+        break;
+    }
+
+    pl_unreachable();
+}
diff --git a/src/shaders/film_grain_av1.c b/src/shaders/film_grain_av1.c
new file mode 100644
index 0000000..3b11ea3
--- /dev/null
+++ b/src/shaders/film_grain_av1.c
@@ -0,0 +1,1001 @@
+/*
+ * This file is part of libplacebo, which is normally licensed under the terms
+ * of the LGPL v2.1+. However, this file (film_grain_av1.c) is also available
+ * under the terms of the more permissive MIT license:
+ *
+ * Copyright (c) 2018-2019 Niklas Haas
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "shaders.h"
+#include "shaders/film_grain.h"
+
+// Gaussian noise samples from the AV1 spec. Range is [-2048, 2047], mean is 0 and stddev is 512
+static const int16_t gaussian_sequence[2048] = {
+  56,    568,   -180,  172,   124,   -84,   172,   -64,   -900,  24,   820,
+  224,   1248,  996,   272,   -8,    -916,  -388,  -732,  -104,  -188, 800,
+  112,   -652,  -320,  -376,  140,   -252,  492,   -168,  44,    -788, 588,
+  -584,  500,   -228,  12,    680,   272,   -476,  972,   -100,  652,  368,
+  432,   -196,  -720,  -192,  1000,  -332,  652,   -136,  -552,  -604, -4,
+  192,   -220,  -136,  1000,  -52,   372,   -96,   -624,  124,   -24,  396,
+  540,   -12,   -104,  640,   464,   244,   -208,  -84,   368,   -528, -740,
+  248,   -968,  -848,  608,   376,   -60,   -292,  -40,   -156,  252,  -292,
+  248,   224,   -280,  400,   -244,  244,   -60,   76,    -80,   212,  532,
+  340,   128,   -36,   824,   -352,  -60,   -264,  -96,   -612,  416,  -704,
+  220,   -204,  640,   -160,  1220,  -408,  900,   336,   20,    -336, -96,
+  -792,  304,   48,    -28,   -1232, -1172, -448,  104,   -292,  -520, 244,
+  60,    -948,  0,     -708,  268,   108,   356,   -548,  488,   -344, -136,
+  488,   -196,  -224,  656,   -236,  -1128, 60,    4,     140,   276,  -676,
+  -376,  168,   -108,  464,   8,     564,   64,    240,   308,   -300, -400,
+  -456,  -136,  56,    120,   -408,  -116,  436,   504,   -232,  328,  844,
+  -164,  -84,   784,   -168,  232,   -224,  348,   -376,  128,   568,  96,
+  -1244, -288,  276,   848,   832,   -360,  656,   464,   -384,  -332, -356,
+  728,   -388,  160,   -192,  468,   296,   224,   140,   -776,  -100, 280,
+  4,     196,   44,    -36,   -648,  932,   16,    1428,  28,    528,  808,
+  772,   20,    268,   88,    -332,  -284,  124,   -384,  -448,  208,  -228,
+  -1044, -328,  660,   380,   -148,  -300,  588,   240,   540,   28,   136,
+  -88,   -436,  256,   296,   -1000, 1400,  0,     -48,   1056,  -136, 264,
+  -528,  -1108, 632,   -484,  -592,  -344,  796,   124,   -668,  -768, 388,
+  1296,  -232,  -188,  -200,  -288,  -4,    308,   100,   -168,  256,  -500,
+  204,   -508,  648,   -136,  372,   -272,  -120,  -1004, -552,  -548, -384,
+  548,   -296,  428,   -108,  -8,    -912,  -324,  -224,  -88,   -112, -220,
+  -100,  996,   -796,  548,   360,   -216,  180,   428,   -200,  -212, 148,
+  96,    148,   284,   216,   -412,  -320,  120,   -300,  -384,  -604, -572,
+  -332,  -8,    -180,  -176,  696,   116,   -88,   628,   76,    44,   -516,
+  240,   -208,  -40,   100,   -592,  344,   -308,  -452,  -228,  20,   916,
+  -1752, -136,  -340,  -804,  140,   40,    512,   340,   248,   184,  -492,
+  896,   -156,  932,   -628,  328,   -688,  -448,  -616,  -752,  -100, 560,
+  -1020, 180,   -800,  -64,   76,    576,   1068,  396,   660,   552,  -108,
+  -28,   320,   -628,  312,   -92,   -92,   -472,  268,   16,    560,  516,
+  -672,  -52,   492,   -100,  260,   384,   284,   292,   304,   -148, 88,
+  -152,  1012,  1064,  -228,  164,   -376,  -684,  592,   -392,  156,  196,
+  -524,  -64,   -884,  160,   -176,  636,   648,   404,   -396,  -436, 864,
+  424,   -728,  988,   -604,  904,   -592,  296,   -224,  536,   -176, -920,
+  436,   -48,   1176,  -884,  416,   -776,  -824,  -884,  524,   -548, -564,
+  -68,   -164,  -96,   692,   364,   -692,  -1012, -68,   260,   -480, 876,
+  -1116, 452,   -332,  -352,  892,   -1088, 1220,  -676,  12,    -292, 244,
+  496,   372,   -32,   280,   200,   112,   -440,  -96,   24,    -644, -184,
+  56,    -432,  224,   -980,  272,   -260,  144,   -436,  420,   356,  364,
+  -528,  76,    172,   -744,  -368,  404,   -752,  -416,  684,   -688, 72,
+  540,   416,   92,    444,   480,   -72,   -1416, 164,   -1172, -68,  24,
+  424,   264,   1040,  128,   -912,  -524,  -356,  64,    876,   -12,  4,
+  -88,   532,   272,   -524,  320,   276,   -508,  940,   24,    -400, -120,
+  756,   60,    236,   -412,  100,   376,   -484,  400,   -100,  -740, -108,
+  -260,  328,   -268,  224,   -200,  -416,  184,   -604,  -564,  -20,  296,
+  60,    892,   -888,  60,    164,   68,    -760,  216,   -296,  904,  -336,
+  -28,   404,   -356,  -568,  -208,  -1480, -512,  296,   328,   -360, -164,
+  -1560, -776,  1156,  -428,  164,   -504,  -112,  120,   -216,  -148, -264,
+  308,   32,    64,    -72,   72,    116,   176,   -64,   -272,  460,  -536,
+  -784,  -280,  348,   108,   -752,  -132,  524,   -540,  -776,  116,  -296,
+  -1196, -288,  -560,  1040,  -472,  116,   -848,  -1116, 116,   636,  696,
+  284,   -176,  1016,  204,   -864,  -648,  -248,  356,   972,   -584, -204,
+  264,   880,   528,   -24,   -184,  116,   448,   -144,  828,   524,  212,
+  -212,  52,    12,    200,   268,   -488,  -404,  -880,  824,   -672, -40,
+  908,   -248,  500,   716,   -576,  492,   -576,  16,    720,   -108, 384,
+  124,   344,   280,   576,   -500,  252,   104,   -308,  196,   -188, -8,
+  1268,  296,   1032,  -1196, 436,   316,   372,   -432,  -200,  -660, 704,
+  -224,  596,   -132,  268,   32,    -452,  884,   104,   -1008, 424,  -1348,
+  -280,  4,     -1168, 368,   476,   696,   300,   -8,    24,    180,  -592,
+  -196,  388,   304,   500,   724,   -160,  244,   -84,   272,   -256, -420,
+  320,   208,   -144,  -156,  156,   364,   452,   28,    540,   316,  220,
+  -644,  -248,  464,   72,    360,   32,    -388,  496,   -680,  -48,  208,
+  -116,  -408,  60,    -604,  -392,  548,   -840,  784,   -460,  656,  -544,
+  -388,  -264,  908,   -800,  -628,  -612,  -568,  572,   -220,  164,  288,
+  -16,   -308,  308,   -112,  -636,  -760,  280,   -668,  432,   364,  240,
+  -196,  604,   340,   384,   196,   592,   -44,   -500,  432,   -580, -132,
+  636,   -76,   392,   4,     -412,  540,   508,   328,   -356,  -36,  16,
+  -220,  -64,   -248,  -60,   24,    -192,  368,   1040,  92,    -24,  -1044,
+  -32,   40,    104,   148,   192,   -136,  -520,  56,    -816,  -224, 732,
+  392,   356,   212,   -80,   -424,  -1008, -324,  588,   -1496, 576,  460,
+  -816,  -848,  56,    -580,  -92,   -1372, -112,  -496,  200,   364,  52,
+  -140,  48,    -48,   -60,   84,    72,    40,    132,   -356,  -268, -104,
+  -284,  -404,  732,   -520,  164,   -304,  -540,  120,   328,   -76,  -460,
+  756,   388,   588,   236,   -436,  -72,   -176,  -404,  -316,  -148, 716,
+  -604,  404,   -72,   -88,   -888,  -68,   944,   88,    -220,  -344, 960,
+  472,   460,   -232,  704,   120,   832,   -228,  692,   -508,  132,  -476,
+  844,   -748,  -364,  -44,   1116,  -1104, -1056, 76,    428,   552,  -692,
+  60,    356,   96,    -384,  -188,  -612,  -576,  736,   508,   892,  352,
+  -1132, 504,   -24,   -352,  324,   332,   -600,  -312,  292,   508,  -144,
+  -8,    484,   48,    284,   -260,  -240,  256,   -100,  -292,  -204, -44,
+  472,   -204,  908,   -188,  -1000, -256,  92,    1164,  -392,  564,  356,
+  652,   -28,   -884,  256,   484,   -192,  760,   -176,  376,   -524, -452,
+  -436,  860,   -736,  212,   124,   504,   -476,  468,   76,    -472, 552,
+  -692,  -944,  -620,  740,   -240,  400,   132,   20,    192,   -196, 264,
+  -668,  -1012, -60,   296,   -316,  -828,  76,    -156,  284,   -768, -448,
+  -832,  148,   248,   652,   616,   1236,  288,   -328,  -400,  -124, 588,
+  220,   520,   -696,  1032,  768,   -740,  -92,   -272,  296,   448,  -464,
+  412,   -200,  392,   440,   -200,  264,   -152,  -260,  320,   1032, 216,
+  320,   -8,    -64,   156,   -1016, 1084,  1172,  536,   484,   -432, 132,
+  372,   -52,   -256,  84,    116,   -352,  48,    116,   304,   -384, 412,
+  924,   -300,  528,   628,   180,   648,   44,    -980,  -220,  1320, 48,
+  332,   748,   524,   -268,  -720,  540,   -276,  564,   -344,  -208, -196,
+  436,   896,   88,    -392,  132,   80,    -964,  -288,  568,   56,   -48,
+  -456,  888,   8,     552,   -156,  -292,  948,   288,   128,   -716, -292,
+  1192,  -152,  876,   352,   -600,  -260,  -812,  -468,  -28,   -120, -32,
+  -44,   1284,  496,   192,   464,   312,   -76,   -516,  -380,  -456, -1012,
+  -48,   308,   -156,  36,    492,   -156,  -808,  188,   1652,  68,   -120,
+  -116,  316,   160,   -140,  352,   808,   -416,  592,   316,   -480, 56,
+  528,   -204,  -568,  372,   -232,  752,   -344,  744,   -4,    324,  -416,
+  -600,  768,   268,   -248,  -88,   -132,  -420,  -432,  80,    -288, 404,
+  -316,  -1216, -588,  520,   -108,  92,    -320,  368,   -480,  -216, -92,
+  1688,  -300,  180,   1020,  -176,  820,   -68,   -228,  -260,  436,  -904,
+  20,    40,    -508,  440,   -736,  312,   332,   204,   760,   -372, 728,
+  96,    -20,   -632,  -520,  -560,  336,   1076,  -64,   -532,  776,  584,
+  192,   396,   -728,  -520,  276,   -188,  80,    -52,   -612,  -252, -48,
+  648,   212,   -688,  228,   -52,   -260,  428,   -412,  -272,  -404, 180,
+  816,   -796,  48,    152,   484,   -88,   -216,  988,   696,   188,  -528,
+  648,   -116,  -180,  316,   476,   12,    -564,  96,    476,   -252, -364,
+  -376,  -392,  556,   -256,  -576,  260,   -352,  120,   -16,   -136, -260,
+  -492,  72,    556,   660,   580,   616,   772,   436,   424,   -32,  -324,
+  -1268, 416,   -324,  -80,   920,   160,   228,   724,   32,    -516, 64,
+  384,   68,    -128,  136,   240,   248,   -204,  -68,   252,   -932, -120,
+  -480,  -628,  -84,   192,   852,   -404,  -288,  -132,  204,   100,  168,
+  -68,   -196,  -868,  460,   1080,  380,   -80,   244,   0,     484,  -888,
+  64,    184,   352,   600,   460,   164,   604,   -196,  320,   -64,  588,
+  -184,  228,   12,    372,   48,    -848,  -344,  224,   208,   -200, 484,
+  128,   -20,   272,   -468,  -840,  384,   256,   -720,  -520,  -464, -580,
+  112,   -120,  644,   -356,  -208,  -608,  -528,  704,   560,   -424, 392,
+  828,   40,    84,    200,   -152,  0,     -144,  584,   280,   -120, 80,
+  -556,  -972,  -196,  -472,  724,   80,    168,   -32,   88,    160,  -688,
+  0,     160,   356,   372,   -776,  740,   -128,  676,   -248,  -480, 4,
+  -364,  96,    544,   232,   -1032, 956,   236,   356,   20,    -40,  300,
+  24,    -676,  -596,  132,   1120,  -104,  532,   -1096, 568,   648,  444,
+  508,   380,   188,   -376,  -604,  1488,  424,   24,    756,   -220, -192,
+  716,   120,   920,   688,   168,   44,    -460,  568,   284,   1144, 1160,
+  600,   424,   888,   656,   -356,  -320,  220,   316,   -176,  -724, -188,
+  -816,  -628,  -348,  -228,  -380,  1012,  -452,  -660,  736,   928,  404,
+  -696,  -72,   -268,  -892,  128,   184,   -344,  -780,  360,   336,  400,
+  344,   428,   548,   -112,  136,   -228,  -216,  -820,  -516,  340,  92,
+  -136,  116,   -300,  376,   -244,  100,   -316,  -520,  -284,  -12,  824,
+  164,   -548,  -180,  -128,  116,   -924,  -828,  268,   -368,  -580, 620,
+  192,   160,   0,     -1676, 1068,  424,   -56,   -360,  468,   -156, 720,
+  288,   -528,  556,   -364,  548,   -148,  504,   316,   152,   -648, -620,
+  -684,  -24,   -376,  -384,  -108,  -920,  -1032, 768,   180,   -264, -508,
+  -1268, -260,  -60,   300,   -240,  988,   724,   -376,  -576,  -212, -736,
+  556,   192,   1092,  -620,  -880,  376,   -56,   -4,    -216,  -32,  836,
+  268,   396,   1332,  864,   -600,  100,   56,    -412,  -92,   356,  180,
+  884,   -468,  -436,  292,   -388,  -804,  -704,  -840,  368,   -348, 140,
+  -724,  1536,  940,   372,   112,   -372,  436,   -480,  1136,  296,  -32,
+  -228,  132,   -48,   -220,  868,   -1016, -60,   -1044, -464,  328,  916,
+  244,   12,    -736,  -296,  360,   468,   -376,  -108,  -92,   788,  368,
+  -56,   544,   400,   -672,  -420,  728,   16,    320,   44,    -284, -380,
+  -796,  488,   132,   204,   -596,  -372,  88,    -152,  -908,  -636, -572,
+  -624,  -116,  -692,  -200,  -56,   276,   -88,   484,   -324,  948,  864,
+  1000,  -456,  -184,  -276,  292,   -296,  156,   676,   320,   160,  908,
+  -84,   -1236, -288,  -116,  260,   -372,  -644,  732,   -756,  -96,  84,
+  344,   -520,  348,   -688,  240,   -84,   216,   -1044, -136,  -676, -396,
+  -1500, 960,   -40,   176,   168,   1516,  420,   -504,  -344,  -364, -360,
+  1216,  -940,  -380,  -212,  252,   -660,  -708,  484,   -444,  -152, 928,
+  -120,  1112,  476,   -260,  560,   -148,  -344,  108,   -196,  228,  -288,
+  504,   560,   -328,  -88,   288,   -1008, 460,   -228,  468,   -836, -196,
+  76,    388,   232,   412,   -1168, -716,  -644,  756,   -172,  -356, -504,
+  116,   432,   528,   48,    476,   -168,  -608,  448,   160,   -532, -272,
+  28,    -676,  -12,   828,   980,   456,   520,   104,   -104,  256,  -344,
+  -4,    -28,   -368,  -52,   -524,  -572,  -556,  -200,  768,   1124, -208,
+  -512,  176,   232,   248,   -148,  -888,  604,   -600,  -304,  804,  -156,
+  -212,  488,   -192,  -804,  -256,  368,   -360,  -916,  -328,  228,  -240,
+  -448,  -472,  856,   -556,  -364,  572,   -12,   -156,  -368,  -340, 432,
+  252,   -752,  -152,  288,   268,   -580,  -848,  -592,  108,   -76,  244,
+  312,   -716,  592,   -80,   436,   360,   4,     -248,  160,   516,  584,
+  732,   44,    -468,  -280,  -292,  -156,  -588,  28,    308,   912,  24,
+  124,   156,   180,   -252,  944,   -924,  -772,  -520,  -428,  -624, 300,
+  -212,  -1144, 32,    -724,  800,   -1128, -212,  -1288, -848,  180,  -416,
+  440,   192,   -576,  -792,  -76,   -1080, 80,    -532,  -352,  -132, 380,
+  -820,  148,   1112,  128,   164,   456,   700,   -924,  144,   -668, -384,
+  648,   -832,  508,   552,   -52,   -100,  -656,  208,   -568,  748,  -88,
+  680,   232,   300,   192,   -408,  -1012, -152,  -252,  -268,  272,  -876,
+  -664,  -648,  -332,  -136,  16,    12,    1152,  -28,   332,   -536, 320,
+  -672,  -460,  -316,  532,   -260,  228,   -40,   1052,  -816,  180,  88,
+  -496,  -556,  -672,  -368,  428,   92,    356,   404,   -408,  252,  196,
+  -176,  -556,  792,   268,   32,    372,   40,    96,    -332,  328,  120,
+  372,   -900,  -40,   472,   -264,  -592,  952,   128,   656,   112,  664,
+  -232,  420,   4,     -344,  -464,  556,   244,   -416,  -32,   252,  0,
+  -412,  188,   -696,  508,   -476,  324,   -1096, 656,   -312,  560,  264,
+  -136,  304,   160,   -64,   -580,  248,   336,   -720,  560,   -348, -288,
+  -276,  -196,  -500,  852,   -544,  -236,  -1128, -992,  -776,  116,  56,
+  52,    860,   884,   212,   -12,   168,   1020,  512,   -552,  924,  -148,
+  716,   188,   164,   -340,  -520,  -184,  880,   -152,  -680,  -208, -1156,
+  -300,  -528,  -472,  364,   100,   -744,  -1056, -32,   540,   280,  144,
+  -676,  -32,   -232,  -280,  -224,  96,    568,   -76,   172,   148,  148,
+  104,   32,    -296,  -32,   788,   -80,   32,    -16,   280,   288,  944,
+  428,   -484
+};
+
+// Advances the 16-bit LFSR in `*state` by one step, returns its top `bits` bits.
+static inline int get_random_number(int bits, uint16_t *state)
+{
+    const unsigned prev = *state;
+    const unsigned tap = (prev ^ (prev >> 1) ^ (prev >> 3) ^ (prev >> 12)) & 1u;
+    *state = (uint16_t) ((prev >> 1) | (tap << 15));
+    return (int) ((*state >> (16 - bits)) & ((1u << bits) - 1u));
+}
+
+// Divides `x` by 2^shift, rounding by adding half the divisor before the
+// right shift. A `shift` of zero returns `x` unchanged.
+static inline int round2(int x, int shift)
+{
+    const int half = shift ? 1 << (shift - 1) : 0;
+    return (x + half) >> shift;
+}
+
+enum {
+    BLOCK_SIZE = 32,        // block step used by `sample()` for neighbour lookups
+    SCALING_LUT_SIZE = 256, // entries per scaling LUT, see `generate_scaling`
+
+    GRAIN_WIDTH = 82,       // dimensions of the full grain buffers (`buf` in
+    GRAIN_HEIGHT = 73,      // `generate_grain_y` / `generate_grain_uv`)
+    // On the GPU we only need a subsection of this
+    GRAIN_WIDTH_LUT = 64,
+    GRAIN_HEIGHT_LUT = 64,
+    GRAIN_PAD_LUT = 9,      // offset of that subsection inside the full buffer
+
+    // For subsampled grain textures
+    SUB_GRAIN_WIDTH = 44,
+    SUB_GRAIN_HEIGHT = 38,
+    SUB_GRAIN_WIDTH_LUT = GRAIN_WIDTH_LUT >> 1,
+    SUB_GRAIN_HEIGHT_LUT = GRAIN_HEIGHT_LUT >> 1,
+    SUB_GRAIN_PAD_LUT = 6,  // subsampled counterpart of GRAIN_PAD_LUT
+};
+
+// Contains the shift by which the offsets are indexed
+enum offset {
+    OFFSET_TL = 24, // top-left neighbour's offset
+    OFFSET_T  = 16, // top neighbour's offset
+    OFFSET_L  = 8,  // left neighbour's offset
+    OFFSET_N  = 0,  // the block's own offset
+};
+
+// Common constants derived from the bit depth, computed by `get_grain_scale`
+struct grain_scale {
+    int grain_center;    // 128 << (bits - 8)
+    int grain_min;       // lower clamp for grain values: -grain_center
+    int grain_max;       // upper clamp: (256 << (bits - 8)) - 1 - grain_center
+    float texture_scale; // value returned by `pl_color_repr_normalize`
+    float grain_scale;   // 1 / (2^bits - 1)
+};
+
+// Bit depth used for grain: `color_depth`, then `sample_depth`, then 8 (via
+// PL_DEF); asserted to be at least 8 and limited to 12 by PL_MIN.
+static inline int bit_depth(const struct pl_color_repr *repr)
+{
+    const int bits = PL_DEF(repr->bits.color_depth, PL_DEF(repr->bits.sample_depth, 8));
+    pl_assert(bits >= 8);
+    return PL_MIN(bits, 12);
+}
+
+// Derives the grain clamping range and scale factors from the bit depth.
+static struct grain_scale get_grain_scale(const struct pl_film_grain_params *params)
+{
+    const int depth = bit_depth(params->repr);
+    const int center = 128 << (depth - 8);
+
+    // Normalize a scratch copy instead of `*params->repr` itself
+    struct pl_color_repr scratch = *params->repr;
+
+    return (struct grain_scale) {
+        .grain_center  = center,
+        .grain_min     = -center,
+        .grain_max     = (256 << (depth - 8)) - 1 - center,
+        .texture_scale = pl_color_repr_normalize(&scratch),
+        // Since our color samples are normalized to the range [0, 1], we need to
+        // scale down grain values from the scale [0, 2^b - 1] to this range.
+        .grain_scale   = 1.0 / ((1 << depth) - 1),
+    };
+}
+
+// Generates the basic grain table (LumaGrain in the spec) into `buf`, and
+// writes its GRAIN_PAD_LUT-offset sub-window, scaled to floats, into `out`.
+static void generate_grain_y(float out[GRAIN_HEIGHT_LUT][GRAIN_WIDTH_LUT],
+                             int16_t buf[GRAIN_HEIGHT][GRAIN_WIDTH],
+                             const struct pl_film_grain_params *params)
+{
+    const struct pl_av1_grain_data *data = &params->data.params.av1;
+    struct grain_scale scale = get_grain_scale(params);
+    uint16_t seed = (uint16_t) params->data.seed;
+    int shift = 12 - bit_depth(params->repr) + data->grain_scale_shift;
+    pl_assert(shift >= 0);
+
+    // Fill the whole buffer with gaussian noise, scaled down by `shift` bits
+    for (int y = 0; y < GRAIN_HEIGHT; y++) {
+        for (int x = 0; x < GRAIN_WIDTH; x++) {
+            buf[y][x] = round2(gaussian_sequence[get_random_number(11, &seed)], shift);
+        }
+    }
+
+    const int ar_pad = 3;
+    int ar_lag = data->ar_coeff_lag;
+
+    // Auto-regressive filter over the rows above and the samples to the left
+    for (int y = ar_pad; y < GRAIN_HEIGHT; y++) {
+        for (int x = ar_pad; x < GRAIN_WIDTH - ar_pad; x++) {
+            const int8_t *coeff = data->ar_coeffs_y;
+            int sum = 0;
+            for (int dy = -ar_lag; dy <= 0; dy++) {
+                for (int dx = -ar_lag; dx <= ar_lag; dx++) {
+                    if (!dx && !dy)
+                        break;
+                    sum += *(coeff++) * buf[y + dy][x + dx];
+                }
+            }
+
+            // Clamp at full `int` precision; narrowing to int16_t first could
+            // wrap for large coefficient sums and clamp to the wrong bound
+            int grain = buf[y][x] + round2(sum, data->ar_coeff_shift);
+            buf[y][x] = PL_CLAMP(grain, scale.grain_min, scale.grain_max);
+        }
+    }
+
+    for (int y = 0; y < GRAIN_HEIGHT_LUT; y++) {
+        for (int x = 0; x < GRAIN_WIDTH_LUT; x++)
+            out[y][x] = buf[y + GRAIN_PAD_LUT][x + GRAIN_PAD_LUT] * scale.grain_scale;
+    }
+}
+
+// Generates the chroma grain table for `channel` (Cb or Cr) into `buf`, mixing
+// in the luma grain from `buf_y`, and writes the scaled LUT window to `out`.
+static void generate_grain_uv(float *out, int16_t buf[GRAIN_HEIGHT][GRAIN_WIDTH],
+                              const int16_t buf_y[GRAIN_HEIGHT][GRAIN_WIDTH],
+                              enum pl_channel channel, int sub_x, int sub_y,
+                              const struct pl_film_grain_params *params)
+{
+    const struct pl_av1_grain_data *data = &params->data.params.av1;
+    struct grain_scale scale = get_grain_scale(params);
+    int shift = 12 - bit_depth(params->repr) + data->grain_scale_shift;
+    pl_assert(shift >= 0);
+
+    uint16_t seed = params->data.seed;
+    if (channel == PL_CHANNEL_CB) {
+        seed ^= 0xb524;
+    } else if (channel == PL_CHANNEL_CR) {
+        seed ^= 0x49d8;
+    }
+
+    int chromaW = sub_x ? SUB_GRAIN_WIDTH  : GRAIN_WIDTH;
+    int chromaH = sub_y ? SUB_GRAIN_HEIGHT : GRAIN_HEIGHT;
+
+    const int8_t *coeffs[] = {
+        [PL_CHANNEL_CB] = data->ar_coeffs_uv[0],
+        [PL_CHANNEL_CR] = data->ar_coeffs_uv[1],
+    };
+
+    for (int y = 0; y < chromaH; y++) {
+        for (int x = 0; x < chromaW; x++) {
+            buf[y][x] = round2(gaussian_sequence[get_random_number(11, &seed)], shift);
+        }
+    }
+
+    const int ar_pad = 3;
+    int ar_lag = data->ar_coeff_lag;
+
+    for (int y = ar_pad; y < chromaH; y++) {
+        for (int x = ar_pad; x < chromaW - ar_pad; x++) {
+            const int8_t *coeff = coeffs[channel];
+            pl_assert(coeff);
+            int sum = 0;
+            for (int dy = -ar_lag; dy <= 0; dy++) {
+                for (int dx = -ar_lag; dx <= ar_lag; dx++) {
+                    // For the final (current) pixel, we need to add in the
+                    // contribution from the luma grain texture
+                    if (!dx && !dy) {
+                        if (!data->num_points_y)
+                            break;
+                        int luma = 0;
+                        int lumaX = ((x - ar_pad) << sub_x) + ar_pad;
+                        int lumaY = ((y - ar_pad) << sub_y) + ar_pad;
+                        for (int i = 0; i <= sub_y; i++) {
+                            for (int j = 0; j <= sub_x; j++) {
+                                luma += buf_y[lumaY + i][lumaX + j];
+                            }
+                        }
+                        luma = round2(luma, sub_x + sub_y);
+                        sum += luma * (*coeff);
+                        break;
+                    }
+
+                    sum += *(coeff++) * buf[y + dy][x + dx];
+                }
+            }
+
+            // Clamp at full `int` precision; narrowing to int16_t first could
+            // wrap for large coefficient sums and clamp to the wrong bound
+            int grain = buf[y][x] + round2(sum, data->ar_coeff_shift);
+            buf[y][x] = PL_CLAMP(grain, scale.grain_min, scale.grain_max);
+        }
+    }
+
+    int lutW = GRAIN_WIDTH_LUT >> sub_x;
+    int lutH = GRAIN_HEIGHT_LUT >> sub_y;
+    int padX = sub_x ? SUB_GRAIN_PAD_LUT : GRAIN_PAD_LUT;
+    int padY = sub_y ? SUB_GRAIN_PAD_LUT : GRAIN_PAD_LUT;
+
+    for (int y = 0; y < lutH; y++) {
+        for (int x = 0; x < lutW; x++) {
+            out[y * lutW + x] = buf[y + padY][x + padX] * scale.grain_scale;
+        }
+    }
+}
+
+// Fills `pbuf` with one 32-bit entry per block, packing the block's own random
+// offset together with those of its left, top and top-left neighbours.
+static void generate_offsets(void *pbuf, const struct sh_lut_params *params)
+{
+    const struct pl_film_grain_data *data = params->priv;
+    unsigned int *buf = pbuf;
+    pl_static_assert(sizeof(unsigned int) >= sizeof(uint32_t));
+
+    for (int row = 0; row < params->height; row++) {
+        // Each row restarts the generator from a row-dependent seed
+        uint16_t rng = data->seed;
+        rng ^= (((row * 37 + 178) & 0xFF) << 8) | ((row * 173 + 105) & 0xFF);
+
+        for (int col = 0; col < params->width; col++) {
+            unsigned int *cur = &buf[row * params->width + col];
+            // A finished entry keeps that block's own value in its low byte
+            const uint32_t own  = get_random_number(8, &rng) & 0xFF;
+            const uint32_t left = col ? *(cur - 1) & 0xFF : 0;
+            const uint32_t top  = row ? *(cur - params->width) & 0xFF : 0;
+            const uint32_t diag = (col && row) ? *(cur - params->width - 1) & 0xFF : 0;
+
+            // Encode four offsets into a single 32-bit integer for the
+            // convenience of the GPU. That way only one LUT fetch is
+            // required for the entire block.
+            *cur = (diag << OFFSET_TL) | (top << OFFSET_T)
+                 | (left << OFFSET_L) | (own << OFFSET_N);
+        }
+    }
+}
+
+// Fills the scaling LUT from the control points in `params->priv`: flat up to
+// the first point, linearly interpolated in between, flat after the last.
+static void generate_scaling(void *pdata, const struct sh_lut_params *params)
+{
+    pl_assert(params->width == SCALING_LUT_SIZE && params->comps == 1);
+    float *data = pdata;
+    struct {
+        int num;
+        uint8_t (*points)[2];
+        const struct pl_av1_grain_data *data;
+    } *ctx = params->priv;
+
+    float range = 1 << ctx->data->scaling_shift;
+
+    // Fill up the preceding entries with the initial value
+    for (int i = 0; i < ctx->points[0][0]; i++)
+        data[i] = ctx->points[0][1] / range;
+
+    // Linearly interpolate the values in the middle
+    for (int i = 0; i < ctx->num - 1; i++) {
+        int bx = ctx->points[i][0], by = ctx->points[i][1];
+        int dx = ctx->points[i + 1][0] - bx, dy = ctx->points[i + 1][1] - by;
+        // A repeated x coordinate covers no entries and would divide by zero
+        if (dx <= 0)
+            continue;
+        int delta = dy * ((0x10000 + (dx >> 1)) / dx);
+        for (int x = 0; x < dx; x++)
+            data[bx + x] = (by + ((x * delta + 0x8000) >> 16)) / range;
+    }
+
+    // Fill up the remaining entries with the final value
+    for (int i = ctx->points[ctx->num - 1][0]; i < SCALING_LUT_SIZE; i++)
+        data[i] = ctx->points[ctx->num - 1][1] / range;
+}
+
+// Emits GLSL that unpacks the block offset stored at bit `off` of `data` and
+// reads `val` from `lut` there; an `idx` >= 0 selects the .x or .y component.
+static void sample(pl_shader sh, enum offset off, ident_t lut, int idx,
+                   int sub_x, int sub_y)
+{
+    static const char *swizzles[] = { ".x", ".y" };
+
+    // OFFSET_L and OFFSET_TL move the position one block in x, OFFSET_T and
+    // OFFSET_TL one block in y (block size shifted right by sub_x / sub_y)
+    const int shift_x = (off & OFFSET_L) ? BLOCK_SIZE >> sub_x : 0;
+    const int shift_y = (off & OFFSET_T) ? BLOCK_SIZE >> sub_y : 0;
+
+    GLSL("offset = uvec2(%du, %du) * uvec2((data >> %d) & 0xFu, \n"
+         "                                 (data >> %d) & 0xFu);\n"
+         "pos = offset + local_id.xy + uvec2(%d, %d);           \n"
+         "val = "$"(pos)%s;                                     \n",
+         sub_x ? 1 : 2, sub_y ? 1 : 2, off + 4, off,
+         shift_x, shift_y,
+         lut, idx >= 0 ? swizzles[idx] : "");
+}
+
+struct grain_obj_av1 { // state object released by `av1_grain_uninit`
+    // LUT objects for the offsets, grain and scaling luts
+    pl_shader_obj lut_offsets;
+    pl_shader_obj lut_grain[2];
+    pl_shader_obj lut_scaling[3];
+
+    // Previous parameters used to check reusability
+    struct pl_film_grain_data data; // presumably compared via `av1_grain_data_eq` - TODO confirm
+    struct pl_color_repr repr;
+    bool fg_has_y; // NOTE(review): these three likely mirror has_y / has_u /
+    bool fg_has_u; // has_v as computed in `pl_needs_fg_av1` - confirm against
+    bool fg_has_v; // the code that writes them
+
+    // Space to store the temporary arrays, reused
+    uint32_t *offsets;
+    float grain[2][GRAIN_HEIGHT_LUT][GRAIN_WIDTH_LUT]; // each shaped like `out` of generate_grain_y
+    int16_t grain_tmp_y[GRAIN_HEIGHT][GRAIN_WIDTH];    // shaped like `buf` of generate_grain_y
+    int16_t grain_tmp_uv[GRAIN_HEIGHT][GRAIN_WIDTH];   // shaped like `buf` of generate_grain_uv
+};
+
+static void av1_grain_uninit(pl_gpu gpu, void *ptr)
+{
+    struct grain_obj_av1 *state = ptr; // the grain_obj_av1 being uninitialized
+    pl_shader_obj_destroy(&state->lut_offsets);
+    for (int n = 0; n < PL_ARRAY_SIZE(state->lut_grain); n++)
+        pl_shader_obj_destroy(state->lut_grain + n);
+    for (int n = 0; n < PL_ARRAY_SIZE(state->lut_scaling); n++)
+        pl_shader_obj_destroy(state->lut_scaling + n);
+    *state = (struct grain_obj_av1) {0}; // finally reset every field to zero
+}
+
+// Returns true if one of the first three mapped components of `params` is a
+// channel for which the AV1 grain metadata requests grain.
+bool pl_needs_fg_av1(const struct pl_film_grain_params *params)
+{
+    const struct pl_av1_grain_data *av1 = &params->data.params.av1;
+    const bool from_luma = av1->chroma_scaling_from_luma;
+    const bool want_y  = av1->num_points_y > 0;
+    const bool want_cb = from_luma || av1->num_points_uv[0] > 0;
+    const bool want_cr = from_luma || av1->num_points_uv[1] > 0;
+    for (int comp = 0; comp < 3; comp++) {
+        switch (channel_map(comp, params)) {
+        case PL_CHANNEL_Y:  if (want_y)  return true; break;
+        case PL_CHANNEL_CB: if (want_cb) return true; break;
+        case PL_CHANNEL_CR: if (want_cr) return true; break;
+        default: break;
+        }
+    }
+    return false;
+}
+
+// Compares only the fields deemed relevant for grain LUT generation
+static inline bool av1_grain_data_eq(const struct pl_film_grain_data *da,
+                                     const struct pl_film_grain_data *db)
+{
+    const struct pl_av1_grain_data *lhs = &da->params.av1;
+    const struct pl_av1_grain_data *rhs = &db->params.av1;
+    if (da->seed != db->seed || lhs->scaling_shift != rhs->scaling_shift ||
+        lhs->chroma_scaling_from_luma != rhs->chroma_scaling_from_luma ||
+        lhs->ar_coeff_lag != rhs->ar_coeff_lag ||
+        lhs->ar_coeff_shift != rhs->ar_coeff_shift ||
+        lhs->grain_scale_shift != rhs->grain_scale_shift)
+        return false; // some scalar parameter differs
+    return !memcmp(lhs->ar_coeffs_y, rhs->ar_coeffs_y, sizeof(lhs->ar_coeffs_y)) &&
+           !memcmp(lhs->ar_coeffs_uv, rhs->ar_coeffs_uv, sizeof(lhs->ar_coeffs_uv));
+}
+// sh_lut fill callback: copies the staged grain out of the state object in `priv`
+static void fill_grain_lut(void *data, const struct sh_lut_params *params)
+{
+    const struct grain_obj_av1 *state = params->priv;
+    const size_t count = params->width * params->height * params->comps;
+    memcpy(data, state->grain, sizeof(float) * count);
+}
+// Emit AV1 grain synthesis for params->tex, caching LUTs in *grain_state; false on error
+bool pl_shader_fg_av1(pl_shader sh, pl_shader_obj *grain_state,
+                      const struct pl_film_grain_params *params)
+{
+    int sub_x = 0, sub_y = 0; // set to 1 where luma_tex is larger than tex
+    int tex_w = params->tex->params.w,
+        tex_h = params->tex->params.h;
+
+    if (params->luma_tex) {
+        sub_x = params->luma_tex->params.w > tex_w;
+        sub_y = params->luma_tex->params.h > tex_h;
+    }
+
+    const struct pl_av1_grain_data *data = &params->data.params.av1;
+    bool fg_has_y = data->num_points_y > 0;
+    bool fg_has_u = data->num_points_uv[0] > 0 || data->chroma_scaling_from_luma;
+    bool fg_has_v = data->num_points_uv[1] > 0 || data->chroma_scaling_from_luma;
+
+    bool tex_is_y = false, tex_is_cb = false, tex_is_cr = false;
+    for (int i = 0; i < 3; i++) {
+        switch (channel_map(i, params)) {
+        case PL_CHANNEL_Y:  tex_is_y = true; break;
+        case PL_CHANNEL_CB: tex_is_cb = true; break;
+        case PL_CHANNEL_CR: tex_is_cr = true; break;
+        default: break;
+        };
+    }
+
+    if (tex_is_y && (sub_x || sub_y)) {
+        PL_WARN(sh, "pl_film_grain_params.channels includes PL_CHANNEL_Y but "
+                "plane is subsampled, this makes no sense. Continuing anyway "
+                "but output is likely incorrect.");
+    }
+
+    if (!sh_require(sh, PL_SHADER_SIG_NONE, tex_w, tex_h))
+        return false;
+
+    pl_gpu gpu = SH_GPU(sh);
+    if (!gpu) {
+        PL_ERR(sh, "AV1 film grain synthesis requires a non-NULL pl_gpu!");
+        return false;
+    }
+
+    // Disable generation for unneeded component types
+    fg_has_y &= tex_is_y;
+    fg_has_u &= tex_is_cb;
+    fg_has_v &= tex_is_cr;
+
+    int bw = BLOCK_SIZE >> sub_x;
+    int bh = BLOCK_SIZE >> sub_y;
+    bool is_compute = sh_try_compute(sh, bw, bh, false, sizeof(uint32_t)); // else: gl_FragCoord path below
+
+    struct grain_obj_av1 *obj;
+    obj = SH_OBJ(sh, grain_state, PL_SHADER_OBJ_AV1_GRAIN,
+                 struct grain_obj_av1, av1_grain_uninit);
+    if (!obj)
+        return false;
+
+    // Note: In theory we could check only the parameters related to luma or
+    // only related to chroma and skip updating for changes to irrelevant
+    // parts, but this is probably not worth it since the seed is expected to
+    // change per frame anyway.
+    bool needs_update = !av1_grain_data_eq(&params->data, &obj->data) ||
+                        !pl_color_repr_equal(params->repr, &obj->repr) ||
+                        fg_has_y != obj->fg_has_y ||
+                        fg_has_u != obj->fg_has_u ||
+                        fg_has_v != obj->fg_has_v;
+
+    if (needs_update) {
+        // This is needed even for chroma, so statically generate it
+        generate_grain_y(obj->grain[0], obj->grain_tmp_y, params);
+    }
+
+    ident_t lut[3];
+    int idx[3] = {-1}; // only idx[0] starts at -1; idx[1], idx[2] are set below
+
+    if (fg_has_y) {
+        lut[0] = sh_lut(sh, sh_lut_params(
+            .object     = &obj->lut_grain[0],
+            .var_type   = PL_VAR_FLOAT,
+            .lut_type   = SH_LUT_TEXTURE,
+            .width      = GRAIN_WIDTH_LUT,
+            .height     = GRAIN_HEIGHT_LUT,
+            .comps      = 1,
+            .update     = needs_update,
+            .dynamic    = true,
+            .fill       = fill_grain_lut,
+            .priv       = obj,
+        ));
+
+        if (!lut[0]) {
+            SH_FAIL(sh, "Failed generating/uploading luma grain LUT!");
+            return false;
+        }
+    }
+
+    // Try merging the chroma LUTs into a single texture. Note that this reuses
+    // obj->grain from index 0, i.e. after the luma LUT above has been filled.
+    int chroma_comps = 0;
+    if (fg_has_u) {
+        generate_grain_uv(&obj->grain[chroma_comps][0][0], obj->grain_tmp_uv,
+                          obj->grain_tmp_y, PL_CHANNEL_CB, sub_x, sub_y,
+                          params);
+        idx[1] = chroma_comps++;
+    }
+    if (fg_has_v) {
+        generate_grain_uv(&obj->grain[chroma_comps][0][0], obj->grain_tmp_uv,
+                          obj->grain_tmp_y, PL_CHANNEL_CR, sub_x, sub_y,
+                          params);
+        idx[2] = chroma_comps++;
+    }
+
+    if (chroma_comps > 0) {
+        lut[1] = lut[2] = sh_lut(sh, sh_lut_params(
+            .object     = &obj->lut_grain[1],
+            .var_type   = PL_VAR_FLOAT,
+            .lut_type   = SH_LUT_TEXTURE,
+            .width      = GRAIN_WIDTH_LUT >> sub_x,
+            .height     = GRAIN_HEIGHT_LUT >> sub_y,
+            .comps      = chroma_comps,
+            .update     = needs_update,
+            .dynamic    = true,
+            .fill       = fill_grain_lut,
+            .priv       = obj,
+        ));
+
+        if (!lut[1]) {
+            SH_FAIL(sh, "Failed generating/uploading chroma grain LUT!");
+            return false;
+        }
+
+        if (chroma_comps == 1) // lone chroma component: sample it without a swizzle
+            idx[1] = idx[2] = -1;
+    }
+
+    ident_t offsets = sh_lut(sh, sh_lut_params(
+        .object     = &obj->lut_offsets,
+        .var_type   = PL_VAR_UINT,
+        .lut_type   = SH_LUT_AUTO,
+        .width      = PL_ALIGN2(tex_w << sub_x, 128) / 32,
+        .height     = PL_ALIGN2(tex_h << sub_y, 128) / 32,
+        .comps      = 1,
+        .update     = needs_update,
+        .dynamic    = true,
+        .fill       = generate_offsets,
+        .priv       = (void *) &params->data,
+    ));
+
+    if (!offsets) {
+        SH_FAIL(sh, "Failed generating/uploading block offsets LUT!");
+        return false;
+    }
+
+    // For the scaling LUTs, we assume they'll be relatively constant
+    // throughout the video so doing some extra work to avoid reinitializing
+    // them constantly is probably worth it. Probably.
+    const struct pl_av1_grain_data *obj_data = &obj->data.params.av1;
+    bool scaling_changed = false;
+    if (fg_has_y || data->chroma_scaling_from_luma) {
+        scaling_changed |= data->num_points_y != obj_data->num_points_y;
+        scaling_changed |= memcmp(data->points_y, obj_data->points_y,
+                                  sizeof(data->points_y));
+    }
+
+    if (fg_has_u && !data->chroma_scaling_from_luma) {
+        scaling_changed |= data->num_points_uv[0] != obj_data->num_points_uv[0];
+        scaling_changed |= memcmp(data->points_uv[0],
+                                  obj_data->points_uv[0],
+                                  sizeof(data->points_uv[0]));
+    }
+
+    if (fg_has_v && !data->chroma_scaling_from_luma) {
+        scaling_changed |= data->num_points_uv[1] != obj_data->num_points_uv[1];
+        scaling_changed |= memcmp(data->points_uv[1],
+                                  obj_data->points_uv[1],
+                                  sizeof(data->points_uv[1]));
+    }
+
+    ident_t scaling[3] = {0};
+    for (int i = 0; i < 3; i++) {
+        struct {
+            int num;
+            const uint8_t (*points)[2];
+            const struct pl_av1_grain_data *data;
+        } priv;
+
+        priv.data = data;
+        if (i == 0 || data->chroma_scaling_from_luma) {
+            priv.num = data->num_points_y;
+            priv.points = &data->points_y[0];
+        } else {
+            priv.num = data->num_points_uv[i - 1];
+            priv.points = &data->points_uv[i - 1][0];
+        }
+
+        // Skip scaling for unneeded channels
+        bool has_c[3] = { fg_has_y, fg_has_u, fg_has_v };
+        if (has_c[i] && priv.num > 0) {
+            scaling[i] = sh_lut(sh, sh_lut_params(
+                .object     = &obj->lut_scaling[i],
+                .var_type   = PL_VAR_FLOAT,
+                .method     = SH_LUT_LINEAR,
+                .width      = SCALING_LUT_SIZE,
+                .comps      = 1,
+                .update     = scaling_changed,
+                .dynamic    = true,
+                .fill       = generate_scaling,
+                .priv       = &priv,
+            ));
+
+            if (!scaling[i]) {
+                SH_FAIL(sh, "Failed generating/uploading scaling LUTs!");
+                return false;
+            }
+        }
+    }
+
+    // Done updating LUTs; remember the inputs so the next call can compare
+    obj->data = params->data;
+    obj->repr = *params->repr;
+    obj->fg_has_y = fg_has_y;
+    obj->fg_has_u = fg_has_u;
+    obj->fg_has_v = fg_has_v;
+
+    sh_describe(sh, "AV1 film grain");
+    GLSL("vec4 color;                   \n"
+         "// pl_shader_film_grain (AV1) \n"
+         "{                             \n"
+         "uvec2 offset;                 \n"
+         "uvec2 pos;                    \n"
+         "float val;                    \n"
+         "float grain;                  \n");
+
+    if (is_compute) {
+        GLSL("uvec2 block_id  = gl_WorkGroupID.xy;        \n"
+             "uvec2 local_id  = gl_LocalInvocationID.xy;  \n"
+             "uvec2 global_id = gl_GlobalInvocationID.xy; \n");
+    } else {
+        GLSL("uvec2 global_id = uvec2(gl_FragCoord);                  \n"
+             "uvec2 block_id  = global_id / uvec2(%d, %d);            \n"
+             "uvec2 local_id  = global_id - uvec2(%d, %d) * block_id; \n",
+             bw, bh, bw, bh);
+    }
+
+    // Load the data vector which holds the offsets. In the compute path a
+    // single invocation fetches it into shared memory for the whole block.
+    if (is_compute) {
+        ident_t id = sh_fresh(sh, "data");
+        GLSLH("shared uint "$"; \n", id);
+        GLSL("if (gl_LocalInvocationIndex == 0u)    \n"
+             "    "$" = uint("$"(block_id));        \n"
+             "barrier();                            \n"
+             "uint data = "$";                      \n",
+             id, offsets, id);
+    } else {
+        GLSL("uint data = uint("$"(block_id)); \n", offsets);
+    }
+
+    struct grain_scale scale = get_grain_scale(params);
+    pl_color_repr_normalize(params->repr);
+    int bits = PL_DEF(params->repr->bits.color_depth, 8);
+    pl_assert(bits >= 8);
+
+    ident_t minValue, maxLuma, maxChroma;
+    if (pl_color_levels_guess(params->repr) == PL_COLOR_LEVELS_LIMITED) {
+        float out_scale = (1 << bits) / ((1 << bits) - 1.0); // k/256 -> (k << (bits - 8)) / (2^bits - 1)
+        minValue  = SH_FLOAT(16  / 256.0 * out_scale);
+        maxLuma   = SH_FLOAT(235 / 256.0 * out_scale);
+        maxChroma = SH_FLOAT(240 / 256.0 * out_scale);
+        if (!pl_color_system_is_ycbcr_like(params->repr->sys))
+            maxChroma = maxLuma;
+    } else {
+        minValue  = SH_FLOAT(0.0);
+        maxLuma   = SH_FLOAT(1.0);
+        maxChroma = SH_FLOAT(1.0);
+    }
+
+    // Load the color value of the tex itself
+    ident_t tex = sh_desc(sh, (struct pl_shader_desc) {
+        .binding.object = params->tex,
+        .desc = (struct pl_desc) {
+            .name = "tex",
+            .type = PL_DESC_SAMPLED_TEX,
+        },
+    });
+
+    ident_t tex_scale = SH_FLOAT(scale.texture_scale);
+    GLSL("color = vec4("$") * texelFetch("$", ivec2(global_id), 0); \n",
+         tex_scale, tex);
+
+    // If we need access to the external luma plane, load it now
+    if (tex_is_cb || tex_is_cr) {
+        GLSL("float averageLuma; \n");
+        if (tex_is_y) {
+            // We already have the luma channel as part of the pre-sampled color
+            for (int i = 0; i < 3; i++) {
+                if (channel_map(i, params) == PL_CHANNEL_Y) {
+                    GLSL("averageLuma = color["$"]; \n", SH_INT(i));
+                    break;
+                }
+            }
+        } else {
+            // Luma channel not present in image, attach it separately
+            pl_assert(params->luma_tex);
+            ident_t luma = sh_desc(sh, (struct pl_shader_desc) {
+                .binding.object = params->luma_tex,
+                .desc = (struct pl_desc) {
+                    .name = "luma",
+                    .type = PL_DESC_SAMPLED_TEX,
+                },
+            });
+
+            GLSL("pos = global_id * uvec2(%du, %du);                    \n"
+                 "averageLuma = texelFetch("$", ivec2(pos), 0)["$"];    \n"
+                 "averageLuma *= "$";                                   \n",
+                 1 << sub_x, 1 << sub_y,
+                 luma, SH_INT(params->luma_comp),
+                 tex_scale);
+        }
+    }
+
+    ident_t grain_min = SH_FLOAT(scale.grain_min * scale.grain_scale);
+    ident_t grain_max = SH_FLOAT(scale.grain_max * scale.grain_scale);
+
+    for (int i = 0; i < params->components; i++) { // i: component of tex, c: its channel
+        enum pl_channel c = channel_map(i, params);
+        if (c == PL_CHANNEL_NONE)
+            continue;
+        if (!scaling[c]) // no scaling LUT was created, so no grain for this channel
+            continue;
+
+        sample(sh, OFFSET_N, lut[c], idx[c], sub_x, sub_y);
+        GLSL("grain = val; \n");
+
+        if (data->overlap) {
+            const char *weights[] = { "vec2(27.0, 17.0)", "vec2(23.0, 22.0)" };
+
+            // X-direction overlapping
+            GLSL("if (block_id.x > 0u && local_id.x < %du) {    \n"
+                 "vec2 w = %s / 32.0;                           \n"
+                 "if (local_id.x == 1u) w.xy = w.yx;            \n",
+                 2 >> sub_x, weights[sub_x]);
+            sample(sh, OFFSET_L, lut[c], idx[c], sub_x, sub_y);
+            GLSL("grain = dot(vec2(val, grain), w);             \n"
+                 "}                                             \n");
+
+            // Y-direction overlapping
+            GLSL("if (block_id.y > 0u && local_id.y < %du) {    \n"
+                 "vec2 w = %s / 32.0;                           \n"
+                 "if (local_id.y == 1u) w.xy = w.yx;            \n",
+                 2 >> sub_y, weights[sub_y]);
+
+            // We need to special-case the top left pixels since these need to
+            // pre-blend the top-left offset block before blending vertically
+            GLSL("    if (block_id.x > 0u && local_id.x < %du) {\n"
+                 "        vec2 w2 = %s / 32.0;                  \n"
+                 "        if (local_id.x == 1u) w2.xy = w2.yx;  \n",
+                 2 >> sub_x, weights[sub_x]);
+                          sample(sh, OFFSET_TL, lut[c], idx[c], sub_x, sub_y);
+            GLSL("        float tmp = val;                      \n");
+                          sample(sh, OFFSET_T, lut[c], idx[c], sub_x, sub_y);
+            GLSL("        val = dot(vec2(tmp, val), w2);        \n"
+                 "    } else {                                  \n");
+                          sample(sh, OFFSET_T, lut[c], idx[c], sub_x, sub_y);
+            GLSL("    }                                         \n"
+                 "grain = dot(vec2(val, grain), w);             \n"
+                 "}                                             \n");
+
+            // Correctly clip the interpolated grain
+            GLSL("grain = clamp(grain, "$", "$"); \n", grain_min, grain_max);
+        }
+
+        if (c == PL_CHANNEL_Y) {
+            GLSL("color[%d] += "$"(color[%d]) * grain;      \n"
+                 "color[%d] = clamp(color[%d], "$", "$");   \n",
+                 i, scaling[c], i,
+                 i, i, minValue, maxLuma);
+        } else {
+            GLSL("val = averageLuma; \n");
+            if (!data->chroma_scaling_from_luma) {
+                // We need to load some extra variables for the mixing. Do this
+                // using sh_var instead of hard-coding them to avoid shader
+                // recompilation when these values change.
+                ident_t mult = sh_var(sh, (struct pl_shader_var) {
+                    .var = pl_var_vec2("mult"),
+                    .data = &(float[2]){
+                        data->uv_mult_luma[c - 1] / 64.0,
+                        data->uv_mult[c - 1] / 64.0,
+                    },
+                });
+
+                int c_offset = (unsigned) data->uv_offset[c - 1] << (bits - 8);
+                ident_t offset = sh_var(sh, (struct pl_shader_var) {
+                    .var = pl_var_float("offset"),
+                    .data = &(float) { c_offset * scale.grain_scale },
+                });
+
+                GLSL("val = dot(vec2(val, color[%d]), "$"); \n"
+                     "val += "$";                           \n",
+                     i, mult, offset);
+            }
+            GLSL("color[%d] += "$"(val) * grain;            \n"
+                 "color[%d] = clamp(color[%d], "$", "$");   \n",
+                 i, scaling[c],
+                 i, i, minValue, maxChroma);
+        }
+    }
+
+    GLSL("} \n");
+    return true;
+}
diff --git a/src/shaders/film_grain_h274.c b/src/shaders/film_grain_h274.c
new file mode 100644
index 0000000..6d524da
--- /dev/null
+++ b/src/shaders/film_grain_h274.c
@@ -0,0 +1,815 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "shaders.h"
+#include "shaders/film_grain.h"
+
+static const int8_t Gaussian_LUT[2048+4];
+static const uint32_t Seed_LUT[256];
+static const int8_t R64T[64][64];
+
+// Advance the shift register in `*state` by one step, shifting in one new bit
+static void prng_shift(uint32_t *state)
+{
+    // Primitive polynomial x^31 + x^3 + 1 (modulo 2)
+    const uint32_t cur = *state;
+    const uint32_t bit = ((cur >> 2) ^ (cur >> 30) ^ 1u) & 1u;
+    *state = (cur << 1) | bit;
+}
+
+// Render the 64x64 grain block for the parameter pair (`h`, `v`) into `out`,
+// advancing by `out_width` floats per row. Both parameters index 13-entry
+// tables here, so they are expected to lie in 0..12; the PRNG is seeded from
+// Seed_LUT[h + v * 13]. Output values are integer grain levels divided by
+// 255.0. `grain` and `tmp` are caller-provided scratch buffers.
+static void generate_slice(float *out, size_t out_width, uint8_t h, uint8_t v,
+                           int8_t grain[64][64], int16_t tmp[64][64])
+{
+    static const uint8_t deblock_factors[13] = {
+        64, 71, 77, 84, 90, 96, 103, 109, 116, 122, 128, 128, 128
+    };
+
+    // Index of the last coefficient that gets populated along each axis (the
+    // loop bounds using these are therefore inclusive)
+    const uint8_t last_h = ((h + 3) << 2) - 1;
+    const uint8_t last_v = ((v + 3) << 2) - 1;
+    const uint8_t edge_gain = deblock_factors[v];
+    uint32_t rng = Seed_LUT[h + v * 13];
+
+    // Start from random gaussian values held in `grain`, consuming four
+    // consecutive table entries per PRNG step.
+    //
+    // Note: To make the subsequent matrix multiplication cache friendlier, we
+    // store each *column* of the starting image in a *row* of `grain`
+    for (int row = 0; row <= last_v; row++) {
+        for (int col = 0; col <= last_h; col += 4) {
+            const int8_t *gauss = &Gaussian_LUT[rng % 2048];
+            for (int k = 0; k < 4; k++)
+                grain[col + k][row] = gauss[k];
+            prng_shift(&rng);
+        }
+    }
+
+    // The coefficient at index (0, 0) is always forced to zero
+    grain[0][0] = 0;
+
+    // 64x64 inverse integer transform, first pass: `grain` -> `tmp`. Only the
+    // populated coefficients take part in the sums.
+    for (int y = 0; y < 64; y++) {
+        for (int x = 0; x <= last_h; x++) {
+            int32_t acc = 0;
+            for (int k = 0; k <= last_v; k++)
+                acc += R64T[y][k] * grain[x][k];
+            tmp[y][x] = (acc + 128) >> 8;
+        }
+    }
+
+    // Second pass: `tmp` -> `grain`, clamped to [-127, 127]
+    for (int y = 0; y < 64; y++) {
+        for (int x = 0; x < 64; x++) {
+            int32_t acc = 0;
+            for (int k = 0; k <= last_h; k++)
+                acc += tmp[y][k] * R64T[x][k]; // R64T^T = R64
+            acc = (acc + 128) >> 8;
+            grain[y][x] = PL_CLAMP(acc, -127, 127);
+        }
+    }
+
+    // Convert to float. Horizontal edges are deblocked by simple attenuation
+    // of the first and last row within every group of eight rows.
+    for (int y = 0; y < 64; y++, out += out_width) {
+        const bool on_edge = y % 8 == 0 || y % 8 == 7;
+        for (int x = 0; x < 64; x++) {
+            if (on_edge) {
+                // The product may be negative, so this relies on `>>` being
+                // an arithmetic shift (implementation-defined in C)
+                out[x] = ((grain[y][x] * edge_gain) >> 7) / 255.0;
+            } else {
+                out[x] = grain[y][x] / 255.0;
+            }
+        }
+    }
+}
+
+// sh_lut fill callback: writes all 13 x 13 grain slices into the float LUT in
+// `data`. Slice (h, v) begins at row h * 64, column v * 64 of that array.
+static void fill_grain_lut(void *data, const struct sh_lut_params *params)
+{
+    // Scratch buffers for generate_slice, allocated once for all slices
+    struct {
+        int8_t grain[64][64];
+        int16_t tmp[64][64];
+    } *scratch = pl_alloc_ptr(NULL, scratch);
+
+    float *lut = data;
+    assert(params->var_type == PL_VAR_FLOAT);
+    for (int i = 0; i < 13 * 13; i++) {
+        const int h = i / 13, v = i % 13;
+        float *slice = lut + (h * 64) * params->width + (v * 64);
+        generate_slice(slice, params->width, h, v, scratch->grain, scratch->tmp);
+    }
+    pl_free(scratch);
+}
+
+// Reports whether H.274 grain applies to `params`: only model_id 0 is handled,
+// and one of the first three mapped channels must have a component model.
+bool pl_needs_fg_h274(const struct pl_film_grain_params *params)
+{
+    const struct pl_h274_grain_data *h274 = &params->data.params.h274;
+    if (h274->model_id != 0)
+        return false;
+
+    for (int comp = 0; comp < 3; comp++) {
+        const enum pl_channel ch = channel_map(comp, params);
+        // Skip unmapped components and anything outside the three channels
+        if (ch >= 0 && ch < 3 && h274->component_model_present[ch])
+            return true;
+    }
+    return false;
+}
+
+// Append H.274 film grain synthesis for `params->tex` to `sh`; the grain
+// database LUT lives in `grain_state`. Needs compute shaders and returns false
+// on failure. `color` is addressed by texture component index (`idx`), the
+// grain model data and the per-channel random number by channel (`c`).
+bool pl_shader_fg_h274(pl_shader sh, pl_shader_obj *grain_state,
+                       const struct pl_film_grain_params *params)
+{
+    if (!sh_require(sh, PL_SHADER_SIG_NONE, params->tex->params.w, params->tex->params.h))
+        return false;
+
+    size_t shmem_req = 0;
+    ident_t group_sum = NULL_IDENT;
+
+    const struct pl_glsl_version glsl = sh_glsl(sh);
+    if (glsl.subgroup_size < 8*8) {
+        group_sum = sh_fresh(sh, "group_sum");
+        shmem_req += sizeof(int);
+        GLSLH("shared int "$"; \n", group_sum);
+    }
+
+    if (!sh_try_compute(sh, 8, 8, false, shmem_req)) {
+        SH_FAIL(sh, "H.274 film grain synthesis requires compute shaders!");
+        return false;
+    }
+
+    ident_t db = sh_lut(sh, sh_lut_params(
+        .object     = grain_state,
+        .var_type   = PL_VAR_FLOAT,
+        .lut_type   = SH_LUT_TEXTURE,
+        .width      = 13 * 64,
+        .height     = 13 * 64,
+        .comps      = 1,
+        .fill       = fill_grain_lut,
+        .signature  = CACHE_KEY_H274, // doesn't depend on anything
+        .cache      = SH_CACHE(sh),
+    ));
+
+    sh_describe(sh, "H.274 film grain");
+    GLSL("vec4 color;                       \n"
+         "// pl_shader_film_grain (H.274)   \n"
+         "{                                 \n");
+
+    // Load the color value of the tex itself
+    ident_t tex = sh_desc(sh, (struct pl_shader_desc) {
+        .binding.object = params->tex,
+        .desc = (struct pl_desc) {
+            .name = "tex",
+            .type = PL_DESC_SAMPLED_TEX,
+        },
+    });
+
+    GLSL("ivec2 pos = ivec2(gl_GlobalInvocationID);     \n"
+         "color = vec4("$") * texelFetch("$", pos, 0);  \n",
+         SH_FLOAT(pl_color_repr_normalize(params->repr)), tex);
+
+    const struct pl_h274_grain_data *data = &params->data.params.h274;
+    ident_t scale_factor = sh_var(sh, (struct pl_shader_var) {
+        .var = pl_var_float("scale_factor"),
+        .data = &(float){ 1.0 / (1 << (data->log2_scale_factor + 6)) },
+    });
+
+    // pcg3d (http://www.jcgt.org/published/0009/03/02/)
+    GLSL("uvec3 pcg = uvec3("$", gl_WorkGroupID.xy / 2u);   \n"
+         "pcg = pcg * 1664525u + 1013904223u;               \n"
+         "pcg.x += pcg.y * pcg.z;                           \n"
+         "pcg.y += pcg.z * pcg.x;                           \n"
+         "pcg.z += pcg.x * pcg.y;                           \n"
+         "pcg ^= pcg >> 16u;                                \n"
+         "pcg.x += pcg.y * pcg.z;                           \n"
+         "pcg.y += pcg.z * pcg.x;                           \n"
+         "pcg.z += pcg.x * pcg.y;                           \n",
+         sh_var(sh, (struct pl_shader_var) {
+            .var = pl_var_uint("seed"),
+            .data = &(unsigned int){ params->data.seed },
+    }));
+
+    for (int idx = 0; idx < params->components; idx++) {
+        enum pl_channel c = channel_map(idx, params);
+        if (c == PL_CHANNEL_NONE)
+            continue;
+        if (!data->component_model_present[c])
+            continue;
+
+        GLSL("// component %d\n{\n", c);
+
+        // Compute the local 8x8 average of this texture component
+        GLSL("float avg = color[%d] / 64.0; \n", idx);
+
+        const int precision = 10000000;
+        if (glsl.subgroup_size)
+            GLSL("avg = subgroupAdd(avg); \n");
+        if (glsl.subgroup_size < 8*8) {
+            // Zero the shared accumulator for every component, otherwise the
+            // sums of earlier components leak into this one's average
+            GLSL("barrier(); "$" = 0; barrier(); \n", group_sum);
+            if (glsl.subgroup_size)
+                GLSL("if (subgroupElect()) \n");
+            GLSL("    atomicAdd("$", int(avg * %d.0));  \n"
+                 "barrier();                            \n"
+                 "avg = float("$") / %d.0;              \n",
+                 group_sum, precision, group_sum, precision);
+        }
+
+        // Hard-coded unrolled loop, to avoid having to load a dynamically
+        // sized array into the shader - and to optimize for the very common
+        // case of there only being a single intensity interval
+        GLSL("uint val; \n");
+        for (int i = 0; i < data->num_intensity_intervals[c]; i++) {
+            ident_t bounds = sh_var(sh, (struct pl_shader_var) {
+                .var = pl_var_vec2("bounds"),
+                .data = &(float[2]) {
+                    data->intensity_interval_lower_bound[c][i] / 255.0,
+                    data->intensity_interval_upper_bound[c][i] / 255.0,
+                },
+            });
+
+            const uint8_t num_values = data->num_model_values[c];
+            uint8_t h = num_values > 1 ? data->comp_model_value[c][i][1] : 8;
+            uint8_t v = num_values > 2 ? data->comp_model_value[c][i][2] : h;
+            h = PL_CLAMP(h, 2, 14) - 2;
+            v = PL_CLAMP(v, 2, 14) - 2;
+            // FIXME: double h/v for subsampled planes!
+
+            // Reduce scale for chroma planes
+            int16_t scale = data->comp_model_value[c][i][0];
+            if (c > 0 && pl_color_system_is_ycbcr_like(params->repr->sys))
+                scale >>= 1;
+
+            pl_static_assert(sizeof(unsigned int) >= sizeof(uint32_t));
+            ident_t values = sh_var(sh, (struct pl_shader_var) {
+                .var = pl_var_uint("comp_model_value"),
+                .data = &(unsigned int) {
+                    (unsigned int) (uint16_t) scale << 16 | h << 8 | v, // unsigned shift: no overflow UB
+                },
+            });
+
+            GLSL("if (avg >= "$".x && avg <= "$".y) \n"
+                 "    val = "$"; else               \n",
+                 bounds, bounds, values);
+        }
+        GLSL("    val = 0u; \n");
+
+        // Extract the grain parameters from comp_model_value
+        GLSL("uvec2 offset = uvec2((val & 0xFF00u) >> 2,    \n"
+             "                     (val & 0xFFu) << 6);     \n"
+             "float scale = "$" * float(int(val >> 16));    \n"
+             // Add randomness
+             "uint rand = pcg[%d];                          \n"
+             "offset.x += (rand >> 16u) %% 52u;             \n"
+             "offset.y += (rand & 0xFFFFu) %% 56u;          \n"
+             "offset.x &= 0xFFFCu;                          \n"
+             "offset.y &= 0xFFF8u;                          \n"
+             "if ((rand & 1u) == 1u) scale = -scale;        \n"
+             // Add local offset and compute grain
+             "offset += 8u * (gl_WorkGroupID.xy %% 2u);     \n"
+             "offset += gl_LocalInvocationID.xy;            \n"
+             "float grain = "$"(offset);                    \n"
+             "color[%d] += scale * grain;                   \n",
+             scale_factor, c, db, idx);
+
+        // TODO: Deblocking?
+
+        GLSL("}\n");
+    }
+
+    GLSL("} \n");
+    return true;
+}
+
+// These tables are all taken from the SMPTE RDD 5-2006 specification
+static const int8_t Gaussian_LUT[2048+4] = {
+    -11, 12, 103, -11, 42, -35, 12, 59, 77, 98, -87, 3, 65, -78, 45, 56, -51, 21,
+    13, -11, -20, -19, 33, -127, 17, -6, -105, 18, 19, 71, 48, -10, -38, 42,
+    -2, 75, -67, 52, -90, 33, -47, 21, -3, -56, 49, 1, -57, -42, -1, 120, -127,
+    -108, -49, 9, 14, 127, 122, 109, 52, 127, 2, 7, 114, 19, 30, 12, 77, 112,
+    82, -61, -127, 111, -52, -29, 2, -49, -24, 58, -29, -73, 12, 112, 67, 79,
+    -3, -114, -87, -6, -5, 40, 58, -81, 49, -27, -31, -34, -105, 50, 16, -24,
+    -35, -14, -15, -127, -55, -22, -55, -127, -112, 5, -26, -72, 127, 127, -2,
+    41, 87, -65, -16, 55, 19, 91, -81, -65, -64, 35, -7, -54, 99, -7, 88, 125,
+    -26, 91, 0, 63, 60, -14, -23, 113, -33, 116, 14, 26, 51, -16, 107, -8, 53,
+    38, -34, 17, -7, 4, -91, 6, 63, 63, -15, 39, -36, 19, 55, 17, -51, 40, 33,
+    -37, 126, -39, -118, 17, -30, 0, 19, 98, 60, 101, -12, -73, -17, -52, 98,
+    3, 3, 60, 33, -3, -2, 10, -42, -106, -38, 14, 127, 16, -127, -31, -86, -39,
+    -56, 46, -41, 75, 23, -19, -22, -70, 74, -54, -2, 32, -45, 17, -92, 59,
+    -64, -67, 56, -102, -29, -87, -34, -92, 68, 5, -74, -61, 93, -43, 14, -26,
+    -38, -126, -17, 16, -127, 64, 34, 31, 93, 17, -51, -59, 71, 77, 81, 127,
+    127, 61, 33, -106, -93, 0, 0, 75, -69, 71, 127, -19, -111, 30, 23, 15, 2,
+    39, 92, 5, 42, 2, -6, 38, 15, 114, -30, -37, 50, 44, 106, 27, 119, 7, -80,
+    25, -68, -21, 92, -11, -1, 18, 41, -50, 79, -127, -43, 127, 18, 11, -21,
+    32, -52, 27, -88, -90, -39, -19, -10, 24, -118, 72, -24, -44, 2, 12, 86,
+    -107, 39, -33, -127, 47, 51, -24, -22, 46, 0, 15, -35, -69, -2, -74, 24,
+    -6, 0, 29, -3, 45, 32, -32, 117, -45, 79, -24, -17, -109, -10, -70, 88,
+    -48, 24, -91, 120, -37, 50, -127, 58, 32, -82, -10, -17, -7, 46, -127, -15,
+    89, 127, 17, 98, -39, -33, 37, 42, -40, -32, -21, 105, -19, 19, 19, -59,
+    -9, 30, 0, -127, 34, 127, -84, 75, 24, -40, -49, -127, -107, -14, 45, -75,
+    1, 30, -20, 41, -68, -40, 12, 127, -3, 5, 20, -73, -59, -127, -3, -3, -53,
+    -6, -119, 93, 120, -80, -50, 0, 20, -46, 67, 78, -12, -22, -127, 36, -41,
+    56, 119, -5, -116, -22, 68, -14, -90, 24, -82, -44, -127, 107, -25, -37,
+    40, -7, -7, -82, 5, -87, 44, -34, 9, -127, 39, 70, 49, -63, 74, -49, 109,
+    -27, -89, -47, -39, 44, 49, -4, 60, -42, 80, 9, -127, -9, -56, -49, 125,
+    -66, 47, 36, 117, 15, -11, -96, 109, 94, -17, -56, 70, 8, -14, -5, 50, 37,
+    -45, 120, -30, -76, 40, -46, 6, 3, 69, 17, -78, 1, -79, 6, 127, 43, 26,
+    127, -127, 28, -55, -26, 55, 112, 48, 107, -1, -77, -1, 53, -9, -22, -43,
+    123, 108, 127, 102, 68, 46, 5, 1, 123, -13, -55, -34, -49, 89, 65, -105,
+    -5, 94, -53, 62, 45, 30, 46, 18, -35, 15, 41, 47, -98, -24, 94, -75, 127,
+    -114, 127, -68, 1, -17, 51, -95, 47, 12, 34, -45, -75, 89, -107, -9, -58,
+    -29, -109, -24, 127, -61, -13, 77, -45, 17, 19, 83, -24, 9, 127, -66, 54,
+    4, 26, 13, 111, 43, -113, -22, 10, -24, 83, 67, -14, 75, -123, 59, 127,
+    -12, 99, -19, 64, -38, 54, 9, 7, 61, -56, 3, -57, 113, -104, -59, 3, -9,
+    -47, 74, 85, -55, -34, 12, 118, 28, 93, -72, 13, -99, -72, -20, 30, 72,
+    -94, 19, -54, 64, -12, -63, -25, 65, 72, -10, 127, 0, -127, 103, -20, -73,
+    -112, -103, -6, 28, -42, -21, -59, -29, -26, 19, -4, -51, 94, -58, -95,
+    -37, 35, 20, -69, 127, -19, -127, -22, -120, -53, 37, 74, -127, -1, -12,
+    -119, -53, -28, 38, 69, 17, 16, -114, 89, 62, 24, 37, -23, 49, -101, -32,
+    -9, -95, -53, 5, 93, -23, -49, -8, 51, 3, -75, -90, -10, -39, 127, -86,
+    -22, 20, 20, 113, 75, 52, -31, 92, -63, 7, -12, 46, 36, 101, -43, -17, -53,
+    -7, -38, -76, -31, -21, 62, 31, 62, 20, -127, 31, 64, 36, 102, -85, -10,
+    77, 80, 58, -79, -8, 35, 8, 80, -24, -9, 3, -17, 72, 127, 83, -87, 55, 18,
+    -119, -123, 36, 10, 127, 56, -55, 113, 13, 26, 32, -13, -48, 22, -13, 5,
+    58, 27, 24, 26, -11, -36, 37, -92, 78, 81, 9, 51, 14, 67, -13, 0, 32, 45,
+    -76, 32, -39, -22, -49, -127, -27, 31, -9, 36, 14, 71, 13, 57, 12, -53,
+    -86, 53, -44, -35, 2, 127, 12, -66, -44, 46, -115, 3, 10, 56, -35, 119,
+    -19, -61, 52, -59, -127, -49, -23, 4, -5, 17, -82, -6, 127, 25, 79, 67, 64,
+    -25, 14, -64, -37, -127, -28, 21, -63, 66, -53, -41, 109, -62, 15, -22, 13,
+    29, -63, 20, 27, 95, -44, -59, -116, -10, 79, -49, 22, -43, -16, 46, -47,
+    -120, -36, -29, -52, -44, 29, 127, -13, 49, -9, -127, 75, -28, -23, 88, 59,
+    11, -95, 81, -59, 58, 60, -26, 40, -92, -3, -22, -58, -45, -59, -22, -53,
+    71, -29, 66, -32, -23, 14, -17, -66, -24, -28, -62, 47, 38, 17, 16, -37,
+    -24, -11, 8, -27, -19, 59, 45, -49, -47, -4, -22, -81, 30, -67, -127, 74,
+    102, 5, -18, 98, 34, -66, 42, -52, 7, -59, 24, -58, -19, -24, -118, -73,
+    91, 15, -16, 79, -32, -79, -127, -36, 41, 77, -83, 2, 56, 22, -75, 127,
+    -16, -21, 12, 31, 56, -113, -127, 90, 55, 61, 12, 55, -14, -113, -14, 32,
+    49, -67, -17, 91, -10, 1, 21, 69, -70, 99, -19, -112, 66, -90, -10, -9,
+    -71, 127, 50, -81, -49, 24, 61, -61, -111, 7, -41, 127, 88, -66, 108, -127,
+    -6, 36, -14, 41, -50, 14, 14, 73, -101, -28, 77, 127, -8, -100, 88, 38,
+    121, 88, -125, -60, 13, -94, -115, 20, -67, -87, -94, -119, 44, -28, -30,
+    18, 5, -53, -61, 20, -43, 11, -77, -60, 13, 29, 3, 6, -72, 38, -60, -11,
+    108, -53, 41, 66, -12, -127, -127, -49, 24, 29, 46, 36, 91, 34, -33, 116,
+    -51, -34, -52, 91, 7, -83, 73, -26, -103, 24, -10, 76, 84, 5, 68, -80, -13,
+    -17, -32, -48, 20, 50, 26, 10, 63, -104, -14, 37, 127, 114, 97, 35, 1, -33,
+    -55, 127, -124, -33, 61, -7, 119, -32, -127, -53, -42, 63, 3, -5, -26, 70,
+    -58, -33, -44, -43, 34, -56, -127, 127, 25, -35, -11, 16, -81, 29, -58, 40,
+    -127, -127, 20, -47, -11, -36, -63, -52, -32, -82, 78, -76, -73, 8, 27,
+    -72, -9, -74, -85, -86, -57, 25, 78, -10, -97, 35, -65, 8, -59, 14, 1, -42,
+    32, -88, -44, 17, -3, -9, 59, 40, 12, -108, -40, 24, 34, 18, -28, 2, 51,
+    -110, -4, 100, 1, 65, 22, 0, 127, 61, 45, 25, -31, 6, 9, -7, -48, 99, 16,
+    44, -2, -40, 32, -39, -52, 10, -110, -19, 56, -127, 69, 26, 51, 92, 40, 61,
+    -52, 45, -38, 13, 85, 122, 27, 66, 45, -111, -83, -3, 31, 37, 19, -36, 58,
+    71, 39, -78, -47, 58, -78, 8, -62, -36, -14, 61, 42, -127, 71, -4, 24, -54,
+    52, -127, 67, -4, -42, 30, -63, 59, -3, -1, -18, -46, -92, -81, -96, -14,
+    -53, -10, -11, -77, 13, 1, 8, -67, -127, 127, -28, 26, -14, 18, -13, -26,
+    2, 10, -46, -32, -15, 27, -31, -59, 59, 77, -121, 28, 40, -54, -62, -31,
+    -21, -37, -32, -6, -127, -25, -60, 70, -127, 112, -127, 127, 88, -7, 116,
+    110, 53, 87, -127, 3, 16, 23, 74, -106, -51, 3, 74, -82, -112, -74, 65, 81,
+    25, 53, 127, -45, -50, -103, -41, -65, -29, 79, -67, 64, -33, -30, -8, 127,
+    0, -13, -51, 67, -14, 5, -92, 29, -35, -8, -90, -57, -3, 36, 43, 44, -31,
+    -69, -7, 36, 39, -51, 43, -81, 58, 6, 127, 12, 57, 66, 46, 59, -43, -42,
+    41, -15, -120, 24, 3, -11, 19, -13, 51, 28, 3, 55, -48, -12, -1, 2, 97,
+    -19, 29, 42, 13, 43, 78, -44, 56, -108, -43, -19, 127, 15, -11, -18, -81,
+    83, -37, 77, -109, 15, 65, -50, 43, 12, 13, 27, 28, 61, 57, 30, 26, 106,
+    -18, 56, 13, 97, 4, -8, -62, -103, 94, 108, -44, 52, 27, -47, -9, 105, -53,
+    46, 89, 103, -33, 38, -34, 55, 51, 70, -94, -35, -87, -107, -19, -31, 9,
+    -19, 79, -14, 77, 5, -19, -107, 85, 21, -45, -39, -42, 9, -29, 74, 47, -75,
+    60, -127, 120, -112, -57, -32, 41, 7, 79, 76, 66, 57, 41, -25, 31, 37, -47,
+    -36, 43, -73, -37, 63, 127, -69, -52, 90, -33, -61, 60, -55, 44, 15, 4,
+    -67, 13, -92, 64, 29, -39, -3, 83, -2, -38, -85, -86, 58, 35, -69, -61, 29,
+    -37, -95, -78, 4, 30, -4, -32, -80, -22, -9, -77, 46, 7, -93, -71, 65, 9,
+    -50, 127, -70, 26, -12, -39, -114, 63, -127, -100, 4, -32, 111, 22, -60,
+    65, -101, 26, -42, 21, -59, -27, -74, 2, -94, 6, 126, 5, 76, -88, -9, -43,
+    -101, 127, 1, 125, 92, -63, 52, 56, 4, 81, -127, 127, 80, 127, -29, 30,
+    116, -74, -17, -57, 105, 48, 45, 25, -72, 48, -38, -108, 31, -34, 4, -11,
+    41, -127, 52, -104, -43, -37, 52, 2, 47, 87, -9, 77, 27, -41, -25, 90, 86,
+    -56, 75, 10, 33, 78, 58, 127, 127, -7, -73, 49, -33, -106, -35, 38, 57, 53,
+    -17, -4, 83, 52, -108, 54, -125, 28, 23, 56, -43, -88, -17, -6, 47, 23, -9,
+    0, -13, 111, 75, 27, -52, -38, -34, 39, 30, 66, 39, 38, -64, 38, 3, 21,
+    -32, -51, -28, 54, -38, -87, 20, 52, 115, 18, -81, -70, 0, -14, -46, -46,
+    -3, 125, 16, -14, 23, -82, -84, -69, -20, -65, -127, 9, 81, -49, 61, 7,
+    -36, -45, -42, 57, -26, 47, 20, -85, 46, -13, 41, -37, -75, -60, 86, -78,
+    -127, 12, 50, 2, -3, 13, 47, 5, 19, -78, -55, -27, 65, -71, 12, -108, 20,
+    -16, 11, -31, 63, -55, 37, 75, -17, 127, -73, -33, -28, -120, 105, 68, 106,
+    -103, -106, 71, 61, 2, 23, -3, 33, -5, -15, -67, -15, -23, -54, 15, -63,
+    76, 58, -110, 1, 83, -27, 22, 75, -39, -17, -11, 64, -17, -127, -54, -66,
+    31, 96, 116, 3, -114, -7, -108, -63, 97, 9, 50, 8, 75, -28, 72, 112, -36,
+    -112, 95, -50, 23, -13, -19, 55, 21, 23, 92, 91, 22, -49, 16, -75, 23, 9,
+    -49, -97, -37, 49, -36, 36, -127, -86, 43, 127, -24, -24, 84, 83, -35, -34,
+    -12, 109, 102, -38, 51, -68, 34, 19, -22, 49, -32, 127, 40, 24, -93, -4,
+    -3, 105, 3, -58, -18, 8, 127, -18, 125, 68, 69, -62, 30, -36, 54, -57, -24,
+    17, 43, -36, -27, -57, -67, -21, -10, -49, 68, 12, 65, 4, 48, 55, 127, -75,
+    44, 89, -66, -13, -78, -82, -91, 22, 30, 33, -40, -87, -34, 96, -91, 39,
+    10, -64, -3, -12, 127, -50, -37, -56, 23, -35, -36, -54, 90, -91, 2, 50,
+    77, -6, -127, 16, 46, -5, -73, 0, -56, -18, -72, 28, 93, 60, 49, 20, 18,
+    111, -111, 32, -83, 47, 47, -10, 35, -88, 43, 57, -98, 127, -17, 0, 1, -39,
+    -127, -2, 0, 63, 93, 0, 36, -66, -61, -19, 39, -127, 58, 50, -17, 127, 88,
+    -43, -108, -51, -16, 7, -36, 68, 46, -14, 107, 40, 57, 7, 19, 8, 3, 88,
+    -90, -92, -18, -21, -24, 13, 7, -4, -78, -91, -4, 8, -35, -5, 19, 2, -111,
+    4, -66, -81, 122, -20, -34, -37, -84, 127, 68, 46, 17, 47,
+
+    // Repeat the beginning of the array to allow wrapping reads
+    -11, 12, 103, -11,
+};
+
+static const uint32_t Seed_LUT[256] = {
+    747538460, 1088979410, 1744950180, 1767011913, 1403382928,
+    521866116, 1060417601, 2110622736, 1557184770, 105289385, 585624216,
+    1827676546, 1191843873, 1018104344, 1123590530, 663361569, 2023850500,
+    76561770, 1226763489, 80325252, 1992581442, 502705249, 740409860,
+    516219202, 557974537, 1883843076, 720112066, 1640137737, 1820967556,
+    40667586, 155354121, 1820967557, 1115949072, 1631803309, 98284748,
+    287433856, 2119719977, 988742797, 1827432592, 579378475, 1017745956,
+    1309377032, 1316535465, 2074315269, 1923385360, 209722667, 1546228260,
+    168102420, 135274561, 355958469, 248291472, 2127839491, 146920100,
+    585982612, 1611702337, 696506029, 1386498192, 1258072451, 1212240548,
+    1043171860, 1217404993, 1090770605, 1386498193, 169093201, 541098240,
+    1468005469, 456510673, 1578687785, 1838217424, 2010752065, 2089828354,
+    1362717428, 970073673, 854129835, 714793201, 1266069081, 1047060864,
+    1991471829, 1098097741, 913883585, 1669598224, 1337918685, 1219264706,
+    1799741108, 1834116681, 683417731, 1120274457, 1073098457, 1648396544,
+    176642749, 31171789, 718317889, 1266977808, 1400892508, 549749008,
+    1808010512, 67112961, 1005669825, 903663673, 1771104465, 1277749632,
+    1229754427, 950632997, 1979371465, 2074373264, 305357524, 1049387408,
+    1171033360, 1686114305, 2147468765, 1941195985, 117709841, 809550080,
+    991480851, 1816248997, 1561503561, 329575568, 780651196, 1659144592,
+    1910793616, 604016641, 1665084765, 1530186961, 1870928913, 809550081,
+    2079346113, 71307521, 876663040, 1073807360, 832356664, 1573927377,
+    204073344, 2026918147, 1702476788, 2043881033, 57949587, 2001393952,
+    1197426649, 1186508931, 332056865, 950043140, 890043474, 349099312,
+    148914948, 236204097, 2022643605, 1441981517, 498130129, 1443421481,
+    924216797, 1817491777, 1913146664, 1411989632, 929068432, 495735097,
+    1684636033, 1284520017, 432816184, 1344884865, 210843729, 676364544,
+    234449232, 12112337, 1350619139, 1753272996, 2037118872, 1408560528,
+    533334916, 1043640385, 357326099, 201376421, 110375493, 541106497,
+    416159637, 242512193, 777294080, 1614872576, 1535546636, 870600145,
+    910810409, 1821440209, 1605432464, 1145147393, 951695441, 1758494976,
+    1506656568, 1557150160, 608221521, 1073840384, 217672017, 684818688,
+    1750138880, 16777217, 677990609, 953274371, 1770050213, 1359128393,
+    1797602707, 1984616737, 1865815816, 2120835200, 2051677060, 1772234061,
+    1579794881, 1652821009, 1742099468, 1887260865, 46468113, 1011925248,
+    1134107920, 881643832, 1354774993, 472508800, 1892499769, 1752793472,
+    1962502272, 687898625, 883538000, 1354355153, 1761673473, 944820481,
+    2020102353, 22020353, 961597696, 1342242816, 964808962, 1355809701,
+    17016649, 1386540177, 647682692, 1849012289, 751668241, 1557184768,
+    127374604, 1927564752, 1045744913, 1614921984, 43588881, 1016185088,
+    1544617984, 1090519041, 136122424, 215038417, 1563027841, 2026918145,
+    1688778833, 701530369, 1372639488, 1342242817, 2036945104, 953274369,
+    1750192384, 16842753, 964808960, 1359020032, 1358954497
+};
+
+// Note: This is pre-transposed, i.e. stored in column-major order
+static const int8_t R64T[64][64] = {
+    {
+         32,  45,  45,  45,  45,  45,  45,  45,  44,  44,  44,  44,  43,  43,  43,  42,
+         42,  41,  41,  40,  40,  39,  39,  38,  38,  37,  36,  36,  35,  34,  34,  33,
+         32,  31,  30,  30,  29,  28,  27,  26,  25,  24,  23,  22,  21,  20,  19,  18,
+         17,  16,  15,  14,  13,  12,  11,  10,   9,   8,   7,   6,   4,   3,   2,   1,
+    }, {
+         32,  45,  45,  44,  43,  42,  41,  39,  38,  36,  34,  31,  29,  26,  23,  20,
+         17,  14,  11,   8,   4,   1,  -2,  -6,  -9, -12, -15, -18, -21, -24, -27, -30,
+        -32, -34, -36, -38, -40, -41, -43, -44, -44, -45, -45, -45, -45, -45, -44, -43,
+        -42, -40, -39, -37, -35, -33, -30, -28, -25, -22, -19, -16, -13, -10,  -7,  -3,
+    }, {
+         32,  45,  44,  42,  40,  37,  34,  30,  25,  20,  15,  10,   4,  -1,  -7, -12,
+        -17, -22, -27, -31, -35, -38, -41, -43, -44, -45, -45, -45, -43, -41, -39, -36,
+        -32, -28, -23, -18, -13,  -8,  -2,   3,   9,  14,  19,  24,  29,  33,  36,  39,
+         42,  44,  45,  45,  45,  44,  43,  40,  38,  34,  30,  26,  21,  16,  11,   6,
+    }, {
+         32,  45,  43,  39,  35,  30,  23,  16,   9,   1,  -7, -14, -21, -28, -34, -38,
+        -42, -44, -45, -45, -43, -40, -36, -31, -25, -18, -11,  -3,   4,  12,  19,  26,
+         32,  37,  41,  44,  45,  45,  44,  41,  38,  33,  27,  20,  13,   6,  -2, -10,
+        -17, -24, -30, -36, -40, -43, -45, -45, -44, -42, -39, -34, -29, -22, -15,  -8,
+    }, {
+         32,  44,  41,  36,  29,  20,  11,   1,  -9, -18, -27, -34, -40, -44, -45, -45,
+        -42, -37, -30, -22, -13,  -3,   7,  16,  25,  33,  39,  43,  45,  45,  43,  38,
+         32,  24,  15,   6,  -4, -14, -23, -31, -38, -42, -45, -45, -43, -39, -34, -26,
+        -17,  -8,   2,  12,  21,  30,  36,  41,  44,  45,  44,  40,  35,  28,  19,  10,
+    }, {
+         32,  44,  39,  31,  21,  10,  -2, -14, -25, -34, -41, -45, -45, -42, -36, -28,
+        -17,  -6,   7,  18,  29,  37,  43,  45,  44,  40,  34,  24,  13,   1, -11, -22,
+        -32, -39, -44, -45, -43, -38, -30, -20,  -9,   3,  15,  26,  35,  41,  45,  45,
+         42,  36,  27,  16,   4,  -8, -19, -30, -38, -43, -45, -44, -40, -33, -23, -12,
+    }, {
+         32,  43,  36,  26,  13,  -1, -15, -28, -38, -44, -45, -42, -35, -24, -11,   3,
+         17,  30,  39,  44,  45,  41,  34,  22,   9,  -6, -19, -31, -40, -45, -45, -40,
+        -32, -20,  -7,   8,  21,  33,  41,  45,  44,  39,  30,  18,   4, -10, -23, -34,
+        -42, -45, -44, -38, -29, -16,  -2,  12,  25,  36,  43,  45,  43,  37,  27,  14,
+    }, {
+         32,  42,  34,  20,   4, -12, -27, -38, -44, -45, -39, -28, -13,   3,  19,  33,
+         42,  45,  43,  34,  21,   6, -11, -26, -38, -44, -45, -39, -29, -14,   2,  18,
+         32,  41,  45,  43,  35,  22,   7, -10, -25, -37, -44, -45, -40, -30, -15,   1,
+         17,  31,  41,  45,  43,  36,  23,   8,  -9, -24, -36, -44, -45, -40, -30, -16,
+    }, {
+         32,  41,  30,  14,  -4, -22, -36, -44, -44, -37, -23,  -6,  13,  30,  41,  45,
+         42,  31,  15,  -3, -21, -36, -44, -45, -38, -24,  -7,  12,  29,  40,  45,  42,
+         32,  16,  -2, -20, -35, -44, -45, -38, -25,  -8,  11,  28,  40,  45,  43,  33,
+         17,  -1, -19, -34, -43, -45, -39, -26,  -9,  10,  27,  39,  45,  43,  34,  18,
+    }, {
+         32,  40,  27,   8, -13, -31, -43, -45, -38, -22,  -2,  18,  35,  44,  44,  34,
+         17,  -3, -23, -38, -45, -42, -30, -12,   9,  28,  41,  45,  40,  26,   7, -14,
+        -32, -43, -45, -37, -21,  -1,  19,  36,  44,  44,  34,  16,  -4, -24, -39, -45,
+        -42, -30, -11,  10,  29,  41,  45,  39,  25,   6, -15, -33, -43, -45, -36, -20,
+    }, {
+         32,  39,  23,   1, -21, -38, -45, -40, -25,  -3,  19,  37,  45,  41,  27,   6,
+        -17, -36, -45, -42, -29,  -8,  15,  34,  44,  43,  30,  10, -13, -33, -44, -44,
+        -32, -12,  11,  31,  43,  44,  34,  14,  -9, -30, -43, -45, -35, -16,   7,  28,
+         42,  45,  36,  18,  -4, -26, -41, -45, -38, -20,   2,  24,  40,  45,  39,  22,
+    }, {
+         32,  38,  19,  -6, -29, -43, -44, -31,  -9,  16,  36,  45,  40,  22,  -2, -26,
+        -42, -45, -34, -12,  13,  34,  45,  41,  25,   1, -23, -40, -45, -36, -15,  10,
+         32,  44,  43,  28,   4, -20, -39, -45, -38, -18,   7,  30,  43,  44,  30,   8,
+        -17, -37, -45, -39, -21,   3,  27,  42,  44,  33,  11, -14, -35, -45, -41, -24,
+    }, {
+         32,  37,  15, -12, -35, -45, -39, -18,   9,  33,  45,  40,  21,  -6, -30, -44,
+        -42, -24,   2,  28,  43,  43,  27,   1, -25, -42, -44, -30,  -4,  22,  41,  45,
+         32,   8, -19, -39, -45, -34, -11,  16,  38,  45,  36,  14, -13, -36, -45, -38,
+        -17,  10,  34,  45,  40,  20,  -7, -31, -44, -41, -23,   3,  29,  44,  43,  26,
+    }, {
+         32,  36,  11, -18, -40, -45, -30,  -3,  25,  43,  43,  24,  -4, -31, -45, -39,
+        -17,  12,  36,  45,  35,  10, -19, -40, -44, -30,  -2,  26,  43,  42,  23,  -6,
+        -32, -45, -39, -16,  13,  37,  45,  34,   9, -20, -41, -44, -29,  -1,  27,  44,
+         42,  22,  -7, -33, -45, -38, -15,  14,  38,  45,  34,   8, -21, -41, -44, -28,
+    }, {
+         32,  34,   7, -24, -43, -41, -19,  12,  38,  45,  30,   1, -29, -45, -39, -14,
+         17,  40,  44,  26,  -4, -33, -45, -36,  -9,  22,  43,  42,  21, -10, -36, -45,
+        -32,  -3,  27,  44,  40,  16, -15, -39, -44, -28,   2,  31,  45,  37,  11, -20,
+        -42, -43, -23,   8,  35,  45,  34,   6, -25, -44, -41, -18,  13,  38,  45,  30,
+    }, {
+         32,  33,   2, -30, -45, -36,  -7,  26,  44,  38,  11, -22, -43, -40, -15,  18,
+         42,  42,  19, -14, -40, -44, -23,  10,  38,  45,  27,  -6, -35, -45, -30,   1,
+         32,  45,  34,   3, -29, -45, -36,  -8,  25,  44,  39,  12, -21, -43, -41, -16,
+         17,  41,  43,  20, -13, -39, -44, -24,   9,  37,  45,  28,  -4, -34, -45, -31,
+    }, {
+         32,  31,  -2, -34, -45, -28,   7,  37,  44,  24, -11, -39, -43, -20,  15,  41,
+         42,  16, -19, -43, -40, -12,  23,  44,  38,   8, -27, -45, -35,  -3,  30,  45,
+         32,  -1, -34, -45, -29,   6,  36,  45,  25, -10, -39, -44, -21,  14,  41,  42,
+         17, -18, -43, -40, -13,  22,  44,  38,   9, -26, -45, -36,  -4,  30,  45,  33,
+    }, {
+         32,  30,  -7, -38, -43, -18,  19,  44,  38,   6, -30, -45, -29,   8,  39,  43,
+         17, -20, -44, -37,  -4,  31,  45,  28,  -9, -39, -43, -16,  21,  44,  36,   3,
+        -32, -45, -27,  10,  40,  42,  15, -22, -44, -36,  -2,  33,  45,  26, -11, -40,
+        -42, -14,  23,  45,  35,   1, -34, -45, -25,  12,  41,  41,  13, -24, -45, -34,
+    }, {
+         32,  28, -11, -41, -40,  -8,  30,  45,  25, -14, -43, -38,  -4,  33,  45,  22,
+        -17, -44, -36,  -1,  35,  44,  19, -20, -44, -34,   2,  37,  43,  16, -23, -45,
+        -32,   6,  39,  42,  13, -26, -45, -30,   9,  40,  41,  10, -29, -45, -27,  12,
+         42,  39,   7, -31, -45, -24,  15,  43,  38,   3, -34, -45, -21,  18,  44,  36,
+    }, {
+         32,  26, -15, -44, -35,   3,  39,  41,   9, -31, -45, -20,  21,  45,  30, -10,
+        -42, -38,  -2,  36,  43,  14, -27, -45, -25,  16,  44,  34,  -4, -39, -41,  -8,
+         32,  45,  19, -22, -45, -30,  11,  42,  38,   1, -36, -43, -13,  28,  45,  24,
+        -17, -44, -34,   6,  40,  40,   7, -33, -44, -18,  23,  45,  29, -12, -43, -37,
+    }, {
+         32,  24, -19, -45, -29,  14,  44,  33,  -9, -42, -36,   3,  40,  39,   2, -37,
+        -42,  -8,  34,  44,  13, -30, -45, -18,  25,  45,  23, -20, -45, -28,  15,  44,
+         32, -10, -43, -36,   4,  40,  39,   1, -38, -41,  -7,  34,  43,  12, -30, -45,
+        -17,  26,  45,  22, -21, -45, -27,  16,  44,  31, -11, -43, -35,   6,  41,  38,
+    }, {
+         32,  22, -23, -45, -21,  24,  45,  20, -25, -45, -19,  26,  45,  18, -27, -45,
+        -17,  28,  45,  16, -29, -45, -15,  30,  44,  14, -30, -44, -13,  31,  44,  12,
+        -32, -44, -11,  33,  43,  10, -34, -43,  -9,  34,  43,   8, -35, -42,  -7,  36,
+         42,   6, -36, -41,  -4,  37,  41,   3, -38, -40,  -2,  38,  40,   1, -39, -39,
+    }, {
+         32,  20, -27, -45, -13,  33,  43,   6, -38, -39,   2,  41,  35, -10, -44, -30,
+         17,  45,  23, -24, -45, -16,  30,  44,   9, -36, -41,  -1,  40,  37,  -7, -43,
+        -32,  14,  45,  26, -21, -45, -19,  28,  44,  12, -34, -42,  -4,  38,  39,  -3,
+        -42, -34,  11,  44,  29, -18, -45, -22,  25,  45,  15, -31, -43,  -8,  36,  40,
+    }, {
+         32,  18, -30, -43,  -4,  39,  36, -10, -44, -26,  23,  45,  13, -34, -41,   1,
+         42,  33, -15, -45, -21,  28,  44,   8, -38, -38,   7,  44,  29, -20, -45, -16,
+         32,  42,   2, -40, -35,  12,  45,  24, -25, -45, -11,  36,  40,  -3, -43, -31,
+         17,  45,  19, -30, -43,  -6,  39,  37,  -9, -44, -27,  22,  45,  14, -34, -41,
+    }, {
+         32,  16, -34, -40,   4,  44,  27, -24, -44,  -8,  39,  36, -13, -45, -19,  31,
+         42,  -1, -43, -30,  21,  45,  11, -37, -38,  10,  45,  22, -29, -43,  -2,  41,
+         32, -18, -45, -14,  35,  39,  -7, -44, -25,  26,  44,   6, -40, -34,  15,  45,
+         17, -33, -41,   3,  43,  28, -23, -45,  -9,  38,  36, -12, -45, -20,  30,  42,
+    }, {
+         32,  14, -36, -37,  13,  45,  15, -36, -38,  12,  45,  16, -35, -38,  11,  45,
+         17, -34, -39,  10,  45,  18, -34, -39,   9,  45,  19, -33, -40,   8,  45,  20,
+        -32, -40,   7,  45,  21, -31, -41,   6,  44,  22, -30, -41,   4,  44,  23, -30,
+        -42,   3,  44,  24, -29, -42,   2,  44,  25, -28, -43,   1,  43,  26, -27, -43,
+    }, {
+         32,  12, -39, -33,  21,  44,   2, -43, -25,  30,  41,  -8, -45, -16,  36,  36,
+        -17, -45,  -7,  41,  29, -26, -43,   3,  44,  20, -34, -38,  13,  45,  11, -39,
+        -32,  22,  44,   1, -43, -24,  30,  40,  -9, -45, -15,  37,  35, -18, -45,  -6,
+         42,  28, -27, -42,   4,  45,  19, -34, -38,  14,  45,  10, -40, -31,  23,  44,
+    }, {
+         32,  10, -41, -28,  29,  40, -11, -45,  -9,  41,  27, -30, -40,  12,  45,   8,
+        -42, -26,  30,  39, -13, -45,  -7,  42,  25, -31, -39,  14,  45,   6, -43, -24,
+         32,  38, -15, -45,  -4,  43,  23, -33, -38,  16,  45,   3, -43, -22,  34,  37,
+        -17, -45,  -2,  44,  21, -34, -36,  18,  44,   1, -44, -20,  35,  36, -19, -44,
+    }, {
+         32,   8, -43, -22,  35,  34, -23, -42,   9,  45,   7, -43, -21,  36,  34, -24,
+        -42,  10,  45,   6, -43, -20,  36,  33, -25, -41,  11,  45,   4, -44, -19,  37,
+         32, -26, -41,  12,  45,   3, -44, -18,  38,  31, -27, -40,  13,  45,   2, -44,
+        -17,  38,  30, -28, -40,  14,  45,   1, -44, -16,  39,  30, -29, -39,  15,  45,
+    }, {
+         32,   6, -44, -16,  40,  26, -34, -34,  25,  40, -15, -44,   4,  45,   7, -44,
+        -17,  39,  27, -33, -35,  24,  41, -14, -44,   3,  45,   8, -43, -18,  39,  28,
+        -32, -36,  23,  41, -13, -45,   2,  45,   9, -43, -19,  38,  29, -31, -36,  22,
+         42, -12, -45,   1,  45,  10, -43, -20,  38,  30, -30, -37,  21,  42, -11, -45,
+    }, {
+         32,   3, -45, -10,  43,  16, -41, -22,  38,  28, -34, -33,  29,  37, -23, -40,
+         17,  43, -11, -45,   4,  45,   2, -45,  -9,  44,  15, -41, -21,  38,  27, -34,
+        -32,  30,  36, -24, -40,  18,  43, -12, -44,   6,  45,   1, -45,  -8,  44,  14,
+        -42, -20,  39,  26, -35, -31,  30,  36, -25, -39,  19,  42, -13, -44,   7,  45,
+    }, {
+         32,   1, -45,  -3,  45,   6, -45,  -8,  44,  10, -44, -12,  43,  14, -43, -16,
+         42,  18, -41, -20,  40,  22, -39, -24,  38,  26, -36, -28,  35,  30, -34, -31,
+         32,  33, -30, -34,  29,  36, -27, -37,  25,  38, -23, -39,  21,  40, -19, -41,
+         17,  42, -15, -43,  13,  44, -11, -44,   9,  45,  -7, -45,   4,  45,  -2, -45,
+    }, {
+         32,  -1, -45,   3,  45,  -6, -45,   8,  44, -10, -44,  12,  43, -14, -43,  16,
+         42, -18, -41,  20,  40, -22, -39,  24,  38, -26, -36,  28,  35, -30, -34,  31,
+         32, -33, -30,  34,  29, -36, -27,  37,  25, -38, -23,  39,  21, -40, -19,  41,
+         17, -42, -15,  43,  13, -44, -11,  44,   9, -45,  -7,  45,   4, -45,  -2,  45,
+    }, {
+         32,  -3, -45,  10,  43, -16, -41,  22,  38, -28, -34,  33,  29, -37, -23,  40,
+         17, -43, -11,  45,   4, -45,   2,  45,  -9, -44,  15,  41, -21, -38,  27,  34,
+        -32, -30,  36,  24, -40, -18,  43,  12, -44,  -6,  45,  -1, -45,   8,  44, -14,
+        -42,  20,  39, -26, -35,  31,  30, -36, -25,  39,  19, -42, -13,  44,   7, -45,
+    }, {
+         32,  -6, -44,  16,  40, -26, -34,  34,  25, -40, -15,  44,   4, -45,   7,  44,
+        -17, -39,  27,  33, -35, -24,  41,  14, -44,  -3,  45,  -8, -43,  18,  39, -28,
+        -32,  36,  23, -41, -13,  45,   2, -45,   9,  43, -19, -38,  29,  31, -36, -22,
+         42,  12, -45,  -1,  45, -10, -43,  20,  38, -30, -30,  37,  21, -42, -11,  45,
+    }, {
+         32,  -8, -43,  22,  35, -34, -23,  42,   9, -45,   7,  43, -21, -36,  34,  24,
+        -42, -10,  45,  -6, -43,  20,  36, -33, -25,  41,  11, -45,   4,  44, -19, -37,
+         32,  26, -41, -12,  45,  -3, -44,  18,  38, -31, -27,  40,  13, -45,   2,  44,
+        -17, -38,  30,  28, -40, -14,  45,  -1, -44,  16,  39, -30, -29,  39,  15, -45,
+    }, {
+         32, -10, -41,  28,  29, -40, -11,  45,  -9, -41,  27,  30, -40, -12,  45,  -8,
+        -42,  26,  30, -39, -13,  45,  -7, -42,  25,  31, -39, -14,  45,  -6, -43,  24,
+         32, -38, -15,  45,  -4, -43,  23,  33, -38, -16,  45,  -3, -43,  22,  34, -37,
+        -17,  45,  -2, -44,  21,  34, -36, -18,  44,  -1, -44,  20,  35, -36, -19,  44,
+    }, {
+         32, -12, -39,  33,  21, -44,   2,  43, -25, -30,  41,   8, -45,  16,  36, -36,
+        -17,  45,  -7, -41,  29,  26, -43,  -3,  44, -20, -34,  38,  13, -45,  11,  39,
+        -32, -22,  44,  -1, -43,  24,  30, -40,  -9,  45, -15, -37,  35,  18, -45,   6,
+         42, -28, -27,  42,   4, -45,  19,  34, -38, -14,  45, -10, -40,  31,  23, -44,
+    }, {
+         32, -14, -36,  37,  13, -45,  15,  36, -38, -12,  45, -16, -35,  38,  11, -45,
+         17,  34, -39, -10,  45, -18, -34,  39,   9, -45,  19,  33, -40,  -8,  45, -20,
+        -32,  40,   7, -45,  21,  31, -41,  -6,  44, -22, -30,  41,   4, -44,  23,  30,
+        -42,  -3,  44, -24, -29,  42,   2, -44,  25,  28, -43,  -1,  43, -26, -27,  43,
+    }, {
+         32, -16, -34,  40,   4, -44,  27,  24, -44,   8,  39, -36, -13,  45, -19, -31,
+         42,   1, -43,  30,  21, -45,  11,  37, -38, -10,  45, -22, -29,  43,  -2, -41,
+         32,  18, -45,  14,  35, -39,  -7,  44, -25, -26,  44,  -6, -40,  34,  15, -45,
+         17,  33, -41,  -3,  43, -28, -23,  45,  -9, -38,  36,  12, -45,  20,  30, -42,
+    }, {
+         32, -18, -30,  43,  -4, -39,  36,  10, -44,  26,  23, -45,  13,  34, -41,  -1,
+         42, -33, -15,  45, -21, -28,  44,  -8, -38,  38,   7, -44,  29,  20, -45,  16,
+         32, -42,   2,  40, -35, -12,  45, -24, -25,  45, -11, -36,  40,   3, -43,  31,
+         17, -45,  19,  30, -43,   6,  39, -37,  -9,  44, -27, -22,  45, -14, -34,  41,
+    }, {
+         32, -20, -27,  45, -13, -33,  43,  -6, -38,  39,   2, -41,  35,  10, -44,  30,
+         17, -45,  23,  24, -45,  16,  30, -44,   9,  36, -41,   1,  40, -37,  -7,  43,
+        -32, -14,  45, -26, -21,  45, -19, -28,  44, -12, -34,  42,  -4, -38,  39,   3,
+        -42,  34,  11, -44,  29,  18, -45,  22,  25, -45,  15,  31, -43,   8,  36, -40,
+    }, {
+         32, -22, -23,  45, -21, -24,  45, -20, -25,  45, -19, -26,  45, -18, -27,  45,
+        -17, -28,  45, -16, -29,  45, -15, -30,  44, -14, -30,  44, -13, -31,  44, -12,
+        -32,  44, -11, -33,  43, -10, -34,  43,  -9, -34,  43,  -8, -35,  42,  -7, -36,
+         42,  -6, -36,  41,  -4, -37,  41,  -3, -38,  40,  -2, -38,  40,  -1, -39,  39,
+    }, {
+         32, -24, -19,  45, -29, -14,  44, -33,  -9,  42, -36,  -3,  40, -39,   2,  37,
+        -42,   8,  34, -44,  13,  30, -45,  18,  25, -45,  23,  20, -45,  28,  15, -44,
+         32,  10, -43,  36,   4, -40,  39,  -1, -38,  41,  -7, -34,  43, -12, -30,  45,
+        -17, -26,  45, -22, -21,  45, -27, -16,  44, -31, -11,  43, -35,  -6,  41, -38,
+    }, {
+         32, -26, -15,  44, -35,  -3,  39, -41,   9,  31, -45,  20,  21, -45,  30,  10,
+        -42,  38,  -2, -36,  43, -14, -27,  45, -25, -16,  44, -34,  -4,  39, -41,   8,
+         32, -45,  19,  22, -45,  30,  11, -42,  38,  -1, -36,  43, -13, -28,  45, -24,
+        -17,  44, -34,  -6,  40, -40,   7,  33, -44,  18,  23, -45,  29,  12, -43,  37,
+    }, {
+         32, -28, -11,  41, -40,   8,  30, -45,  25,  14, -43,  38,  -4, -33,  45, -22,
+        -17,  44, -36,   1,  35, -44,  19,  20, -44,  34,   2, -37,  43, -16, -23,  45,
+        -32,  -6,  39, -42,  13,  26, -45,  30,   9, -40,  41, -10, -29,  45, -27, -12,
+         42, -39,   7,  31, -45,  24,  15, -43,  38,  -3, -34,  45, -21, -18,  44, -36,
+    }, {
+         32, -30,  -7,  38, -43,  18,  19, -44,  38,  -6, -30,  45, -29,  -8,  39, -43,
+         17,  20, -44,  37,  -4, -31,  45, -28,  -9,  39, -43,  16,  21, -44,  36,  -3,
+        -32,  45, -27, -10,  40, -42,  15,  22, -44,  36,  -2, -33,  45, -26, -11,  40,
+        -42,  14,  23, -45,  35,  -1, -34,  45, -25, -12,  41, -41,  13,  24, -45,  34,
+    }, {
+         32, -31,  -2,  34, -45,  28,   7, -37,  44, -24, -11,  39, -43,  20,  15, -41,
+         42, -16, -19,  43, -40,  12,  23, -44,  38,  -8, -27,  45, -35,   3,  30, -45,
+         32,   1, -34,  45, -29,  -6,  36, -45,  25,  10, -39,  44, -21, -14,  41, -42,
+         17,  18, -43,  40, -13, -22,  44, -38,   9,  26, -45,  36,  -4, -30,  45, -33,
+    }, {
+         32, -33,   2,  30, -45,  36,  -7, -26,  44, -38,  11,  22, -43,  40, -15, -18,
+         42, -42,  19,  14, -40,  44, -23, -10,  38, -45,  27,   6, -35,  45, -30,  -1,
+         32, -45,  34,  -3, -29,  45, -36,   8,  25, -44,  39, -12, -21,  43, -41,  16,
+         17, -41,  43, -20, -13,  39, -44,  24,   9, -37,  45, -28,  -4,  34, -45,  31,
+    }, {
+         32, -34,   7,  24, -43,  41, -19, -12,  38, -45,  30,  -1, -29,  45, -39,  14,
+         17, -40,  44, -26,  -4,  33, -45,  36,  -9, -22,  43, -42,  21,  10, -36,  45,
+        -32,   3,  27, -44,  40, -16, -15,  39, -44,  28,   2, -31,  45, -37,  11,  20,
+        -42,  43, -23,  -8,  35, -45,  34,  -6, -25,  44, -41,  18,  13, -38,  45, -30,
+    }, {
+         32, -36,  11,  18, -40,  45, -30,   3,  25, -43,  43, -24,  -4,  31, -45,  39,
+        -17, -12,  36, -45,  35, -10, -19,  40, -44,  30,  -2, -26,  43, -42,  23,   6,
+        -32,  45, -39,  16,  13, -37,  45, -34,   9,  20, -41,  44, -29,   1,  27, -44,
+         42, -22,  -7,  33, -45,  38, -15, -14,  38, -45,  34,  -8, -21,  41, -44,  28,
+    }, {
+         32, -37,  15,  12, -35,  45, -39,  18,   9, -33,  45, -40,  21,   6, -30,  44,
+        -42,  24,   2, -28,  43, -43,  27,  -1, -25,  42, -44,  30,  -4, -22,  41, -45,
+         32,  -8, -19,  39, -45,  34, -11, -16,  38, -45,  36, -14, -13,  36, -45,  38,
+        -17, -10,  34, -45,  40, -20,  -7,  31, -44,  41, -23,  -3,  29, -44,  43, -26,
+    }, {
+         32, -38,  19,   6, -29,  43, -44,  31,  -9, -16,  36, -45,  40, -22,  -2,  26,
+        -42,  45, -34,  12,  13, -34,  45, -41,  25,  -1, -23,  40, -45,  36, -15, -10,
+         32, -44,  43, -28,   4,  20, -39,  45, -38,  18,   7, -30,  43, -44,  30,  -8,
+        -17,  37, -45,  39, -21,  -3,  27, -42,  44, -33,  11,  14, -35,  45, -41,  24,
+    }, {
+         32, -39,  23,  -1, -21,  38, -45,  40, -25,   3,  19, -37,  45, -41,  27,  -6,
+        -17,  36, -45,  42, -29,   8,  15, -34,  44, -43,  30, -10, -13,  33, -44,  44,
+        -32,  12,  11, -31,  43, -44,  34, -14,  -9,  30, -43,  45, -35,  16,   7, -28,
+         42, -45,  36, -18,  -4,  26, -41,  45, -38,  20,   2, -24,  40, -45,  39, -22,
+    }, {
+         32, -40,  27,  -8, -13,  31, -43,  45, -38,  22,  -2, -18,  35, -44,  44, -34,
+         17,   3, -23,  38, -45,  42, -30,  12,   9, -28,  41, -45,  40, -26,   7,  14,
+        -32,  43, -45,  37, -21,   1,  19, -36,  44, -44,  34, -16,  -4,  24, -39,  45,
+        -42,  30, -11, -10,  29, -41,  45, -39,  25,  -6, -15,  33, -43,  45, -36,  20,
+    }, {
+         32, -41,  30, -14,  -4,  22, -36,  44, -44,  37, -23,   6,  13, -30,  41, -45,
+         42, -31,  15,   3, -21,  36, -44,  45, -38,  24,  -7, -12,  29, -40,  45, -42,
+         32, -16,  -2,  20, -35,  44, -45,  38, -25,   8,  11, -28,  40, -45,  43, -33,
+         17,   1, -19,  34, -43,  45, -39,  26,  -9, -10,  27, -39,  45, -43,  34, -18,
+    }, {
+         32, -42,  34, -20,   4,  12, -27,  38, -44,  45, -39,  28, -13,  -3,  19, -33,
+         42, -45,  43, -34,  21,  -6, -11,  26, -38,  44, -45,  39, -29,  14,   2, -18,
+         32, -41,  45, -43,  35, -22,   7,  10, -25,  37, -44,  45, -40,  30, -15,  -1,
+         17, -31,  41, -45,  43, -36,  23,  -8,  -9,  24, -36,  44, -45,  40, -30,  16,
+    }, {
+         32, -43,  36, -26,  13,   1, -15,  28, -38,  44, -45,  42, -35,  24, -11,  -3,
+         17, -30,  39, -44,  45, -41,  34, -22,   9,   6, -19,  31, -40,  45, -45,  40,
+        -32,  20,  -7,  -8,  21, -33,  41, -45,  44, -39,  30, -18,   4,  10, -23,  34,
+        -42,  45, -44,  38, -29,  16,  -2, -12,  25, -36,  43, -45,  43, -37,  27, -14,
+    }, {
+         32, -44,  39, -31,  21, -10,  -2,  14, -25,  34, -41,  45, -45,  42, -36,  28,
+        -17,   6,   7, -18,  29, -37,  43, -45,  44, -40,  34, -24,  13,  -1, -11,  22,
+        -32,  39, -44,  45, -43,  38, -30,  20,  -9,  -3,  15, -26,  35, -41,  45, -45,
+         42, -36,  27, -16,   4,   8, -19,  30, -38,  43, -45,  44, -40,  33, -23,  12,
+    }, {
+         32, -44,  41, -36,  29, -20,  11,  -1,  -9,  18, -27,  34, -40,  44, -45,  45,
+        -42,  37, -30,  22, -13,   3,   7, -16,  25, -33,  39, -43,  45, -45,  43, -38,
+         32, -24,  15,  -6,  -4,  14, -23,  31, -38,  42, -45,  45, -43,  39, -34,  26,
+        -17,   8,   2, -12,  21, -30,  36, -41,  44, -45,  44, -40,  35, -28,  19, -10,
+    }, {
+         32, -45,  43, -39,  35, -30,  23, -16,   9,  -1,  -7,  14, -21,  28, -34,  38,
+        -42,  44, -45,  45, -43,  40, -36,  31, -25,  18, -11,   3,   4, -12,  19, -26,
+         32, -37,  41, -44,  45, -45,  44, -41,  38, -33,  27, -20,  13,  -6,  -2,  10,
+        -17,  24, -30,  36, -40,  43, -45,  45, -44,  42, -39,  34, -29,  22, -15,   8,
+    }, {
+         32, -45,  44, -42,  40, -37,  34, -30,  25, -20,  15, -10,   4,   1,  -7,  12,
+        -17,  22, -27,  31, -35,  38, -41,  43, -44,  45, -45,  45, -43,  41, -39,  36,
+        -32,  28, -23,  18, -13,   8,  -2,  -3,   9, -14,  19, -24,  29, -33,  36, -39,
+         42, -44,  45, -45,  45, -44,  43, -40,  38, -34,  30, -26,  21, -16,  11,  -6,
+    }, {
+         32, -45,  45, -44,  43, -42,  41, -39,  38, -36,  34, -31,  29, -26,  23, -20,
+         17, -14,  11,  -8,   4,  -1,  -2,   6,  -9,  12, -15,  18, -21,  24, -27,  30,
+        -32,  34, -36,  38, -40,  41, -43,  44, -44,  45, -45,  45, -45,  45, -44,  43,
+        -42,  40, -39,  37, -35,  33, -30,  28, -25,  22, -19,  16, -13,  10,  -7,   3,
+    }, {
+         32, -45,  45, -45,  45, -45,  45, -45,  44, -44,  44, -44,  43, -43,  43, -42,
+         42, -41,  41, -40,  40, -39,  39, -38,  38, -37,  36, -36,  35, -34,  34, -33,
+         32, -31,  30, -30,  29, -28,  27, -26,  25, -24,  23, -22,  21, -20,  19, -18,
+         17, -16,  15, -14,  13, -12,  11, -10,   9,  -8,   7,  -6,   4,  -3,   2,  -1,
+    }
+};
diff --git a/src/shaders/icc.c b/src/shaders/icc.c
new file mode 100644
index 0000000..6a16cfd
--- /dev/null
+++ b/src/shaders/icc.c
@@ -0,0 +1,781 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <math.h>
+#include "shaders.h"
+
+#include <libplacebo/tone_mapping.h>
+#include <libplacebo/shaders/icc.h>
+
+const struct pl_icc_params pl_icc_default_params = { PL_ICC_DEFAULTS }; // used whenever callers pass params == NULL
+
+#ifdef PL_HAVE_LCMS
+
+#include <lcms2.h>
+#include <lcms2_plugin.h>
+
+struct icc_priv {
+    pl_log log; // logging context; also registered as the lcms2 context user data
+    pl_cache cache; // wraps the legacy cache_save/cache_load callbacks (backwards compatibility)
+    cmsContext cms; // LittleCMS context used for every lcms2 call on this object
+    cmsHPROFILE profile; // the opened ICC profile
+    cmsHPROFILE approx; // approximation profile
+    float a, b, scale; // approximation tone curve parameters and scaling
+    cmsCIEXYZ black; // black point reported by cmsDetectDestinationBlackPoint
+    float gamma_stddev; // standard deviation of the per-sample gamma estimates
+    uint64_t lut_sig; // hash of all parameters affecting the generated 3DLUT
+};
+
+// lcms2 error handler: relays messages to the pl_log stored as user data
+static void error_callback(cmsContext cms, cmsUInt32Number code, const char *msg)
+{
+    pl_log logger = cmsGetContextUserData(cms);
+    pl_err(logger, "lcms2: [%d] %s", (int) code, msg);
+}
+
+static void set_callback(void *priv, pl_cache_obj obj) // legacy cache_save shim
+{
+    const struct pl_icc_params *par = &((pl_icc_object) priv)->params;
+    par->cache_save(par->cache_priv, obj.key, obj.data, obj.size);
+}
+
+// pl_cache `get` hook wrapping the legacy cache_load callback. Returns an
+// empty object on a miss; on a hit the buffer is handed over with pl_free.
+static pl_cache_obj get_callback(void *priv, uint64_t key)
+{
+    pl_icc_object icc = priv;
+    const struct pl_icc_params *par = &icc->params;
+    // Widen before multiplying so the byte count is never computed in `int`
+    size_t data_size = sizeof(uint16_t[4]) * par->size_r * par->size_g * par->size_b;
+    void *data = pl_alloc(NULL, data_size);
+    if (!par->cache_load(par->cache_priv, key, data, data_size)) {
+        pl_free(data);
+        return (pl_cache_obj) {0};
+    }
+
+    return (pl_cache_obj) {
+        .key  = key,       .data = data,
+        .size = data_size, .free = pl_free,
+    };
+}
+
+// Tear down an ICC object: both profiles, the lcms2 context, the compat cache
+void pl_icc_close(pl_icc_object *picc)
+{
+    pl_icc_object obj = *picc;
+    if (!obj)
+        return;
+    struct icc_priv *priv = PL_PRIV(obj);
+    cmsCloseProfile(priv->approx);
+    cmsCloseProfile(priv->profile);
+    cmsDeleteContext(priv->cms);
+    pl_cache_destroy(&priv->cache);
+    pl_free_ptr((void **) picc);
+}
+
+// Probe the profile with a fixed set of 8-bit RGB test colors, transformed to
+// unadapted XYZ, to recover its primaries and an approximate power-law gamma.
+//
+// Writes the primaries to `prim` and the mean gamma to `out_gamma`, and stores
+// the gamma standard deviation in the private state. Returns false if lcms2
+// fails or if the gamma estimate is not a positive number.
+static bool detect_csp(pl_icc_object icc, struct pl_raw_primaries *prim,
+                       float *out_gamma)
+{
+    struct icc_priv *p = PL_PRIV(icc);
+    cmsHTRANSFORM tf;
+    cmsHPROFILE xyz = cmsCreateXYZProfileTHR(p->cms);
+    if (!xyz)
+        return false;
+
+    // We need to use an unadapted observer to get the raw values
+    cmsFloat64Number prev_adapt = cmsSetAdaptationStateTHR(p->cms, 0.0);
+    tf = cmsCreateTransformTHR(p->cms, p->profile, TYPE_RGB_8, xyz, TYPE_XYZ_DBL,
+                               INTENT_ABSOLUTE_COLORIMETRIC,
+                               /* Note: These flags mostly don't do anything
+                                * anyway, but specify them regardless */
+                               cmsFLAGS_NOCACHE |
+                               cmsFLAGS_NOOPTIMIZE);
+    cmsSetAdaptationStateTHR(p->cms, prev_adapt);
+    cmsCloseProfile(xyz);
+    if (!tf)
+        return false;
+
+    // Indices into `test` / `dst`; RAMP is the first grayscale ramp entry
+    enum { RED, GREEN, BLUE, WHITE, BLACK, GRAY, RAMP };
+
+    static const uint8_t test[][3] = {
+        [RED]   = { 0xFF,    0,    0 },
+        [GREEN] = {    0, 0xFF,    0 },
+        [BLUE]  = {    0,    0, 0xFF },
+        [WHITE] = { 0xFF, 0xFF, 0xFF },
+        [BLACK] = { 0x00, 0x00, 0x00 },
+        [GRAY]  = { 0x80, 0x80, 0x80 },
+
+        // Grayscale ramp (excluding endpoints)
+#define V(d) { d, d, d }
+                 V(0x01), V(0x02), V(0x03), V(0x04), V(0x05), V(0x06), V(0x07),
+        V(0x08), V(0x09), V(0x0A), V(0x0B), V(0x0C), V(0x0D), V(0x0E), V(0x0F),
+        V(0x10), V(0x11), V(0x12), V(0x13), V(0x14), V(0x15), V(0x16), V(0x17),
+        V(0x18), V(0x19), V(0x1A), V(0x1B), V(0x1C), V(0x1D), V(0x1E), V(0x1F),
+        V(0x20), V(0x21), V(0x22), V(0x23), V(0x24), V(0x25), V(0x26), V(0x27),
+        V(0x28), V(0x29), V(0x2A), V(0x2B), V(0x2C), V(0x2D), V(0x2E), V(0x2F),
+        V(0x30), V(0x31), V(0x32), V(0x33), V(0x34), V(0x35), V(0x36), V(0x37),
+        V(0x38), V(0x39), V(0x3A), V(0x3B), V(0x3C), V(0x3D), V(0x3E), V(0x3F),
+        V(0x40), V(0x41), V(0x42), V(0x43), V(0x44), V(0x45), V(0x46), V(0x47),
+        V(0x48), V(0x49), V(0x4A), V(0x4B), V(0x4C), V(0x4D), V(0x4E), V(0x4F),
+        V(0x50), V(0x51), V(0x52), V(0x53), V(0x54), V(0x55), V(0x56), V(0x57),
+        V(0x58), V(0x59), V(0x5A), V(0x5B), V(0x5C), V(0x5D), V(0x5E), V(0x5F),
+        V(0x60), V(0x61), V(0x62), V(0x63), V(0x64), V(0x65), V(0x66), V(0x67),
+        V(0x68), V(0x69), V(0x6A), V(0x6B), V(0x6C), V(0x6D), V(0x6E), V(0x6F),
+        V(0x70), V(0x71), V(0x72), V(0x73), V(0x74), V(0x75), V(0x76), V(0x77),
+        V(0x78), V(0x79), V(0x7A), V(0x7B), V(0x7C), V(0x7D), V(0x7E), V(0x7F),
+        V(0x80), V(0x81), V(0x82), V(0x83), V(0x84), V(0x85), V(0x86), V(0x87),
+        V(0x88), V(0x89), V(0x8A), V(0x8B), V(0x8C), V(0x8D), V(0x8E), V(0x8F),
+        V(0x90), V(0x91), V(0x92), V(0x93), V(0x94), V(0x95), V(0x96), V(0x97),
+        V(0x98), V(0x99), V(0x9A), V(0x9B), V(0x9C), V(0x9D), V(0x9E), V(0x9F),
+        V(0xA0), V(0xA1), V(0xA2), V(0xA3), V(0xA4), V(0xA5), V(0xA6), V(0xA7),
+        V(0xA8), V(0xA9), V(0xAA), V(0xAB), V(0xAC), V(0xAD), V(0xAE), V(0xAF),
+        V(0xB0), V(0xB1), V(0xB2), V(0xB3), V(0xB4), V(0xB5), V(0xB6), V(0xB7),
+        V(0xB8), V(0xB9), V(0xBA), V(0xBB), V(0xBC), V(0xBD), V(0xBE), V(0xBF),
+        V(0xC0), V(0xC1), V(0xC2), V(0xC3), V(0xC4), V(0xC5), V(0xC6), V(0xC7),
+        V(0xC8), V(0xC9), V(0xCA), V(0xCB), V(0xCC), V(0xCD), V(0xCE), V(0xCF),
+        V(0xD0), V(0xD1), V(0xD2), V(0xD3), V(0xD4), V(0xD5), V(0xD6), V(0xD7),
+        V(0xD8), V(0xD9), V(0xDA), V(0xDB), V(0xDC), V(0xDD), V(0xDE), V(0xDF),
+        V(0xE0), V(0xE1), V(0xE2), V(0xE3), V(0xE4), V(0xE5), V(0xE6), V(0xE7),
+        V(0xE8), V(0xE9), V(0xEA), V(0xEB), V(0xEC), V(0xED), V(0xEE), V(0xEF),
+        V(0xF0), V(0xF1), V(0xF2), V(0xF3), V(0xF4), V(0xF5), V(0xF6), V(0xF7),
+        V(0xF8), V(0xF9), V(0xFA), V(0xFB), V(0xFC), V(0xFD), V(0xFE),
+#undef V
+    };
+
+    cmsCIEXYZ dst[PL_ARRAY_SIZE(test)] = {0};
+    cmsDoTransform(tf, test, dst, PL_ARRAY_SIZE(dst));
+    cmsDeleteTransform(tf);
+
+    // Read primaries from transformed RGBW values
+    prim->red   = pl_cie_from_XYZ(dst[RED].X, dst[RED].Y, dst[RED].Z);
+    prim->green = pl_cie_from_XYZ(dst[GREEN].X, dst[GREEN].Y, dst[GREEN].Z);
+    prim->blue  = pl_cie_from_XYZ(dst[BLUE].X, dst[BLUE].Y, dst[BLUE].Z);
+    prim->white = pl_cie_from_XYZ(dst[WHITE].X, dst[WHITE].Y, dst[WHITE].Z);
+
+    // Rough estimate of overall gamma and starting point for curve black point
+    const float y_approx = dst[GRAY].Y ? log(dst[GRAY].Y) / log(0.5) : 1.0f;
+    const float kb = fmaxf(dst[BLACK].Y, 0.0f);
+    float b = powf(kb, 1 / y_approx);
+
+    // Estimate mean and stddev of gamma (Welford's method)
+    float M = 0.0, S = 0.0;
+    int k = 1;
+    for (int i = RAMP; i < PL_ARRAY_SIZE(dst); i++) { // exclude primaries
+        if (dst[i].Y <= 0 || dst[i].Y >= 1)
+            continue;
+        float src = (1 - b) * (test[i][0] / 255.0) + b;
+        float y = log(dst[i].Y) / log(src);
+        float tmpM = M;
+        M += (y - tmpM) / k;
+        S += (y - tmpM) * (y - M);
+        k++;
+
+        // Update estimate of black point according to current gamma estimate
+        b = powf(kb, 1 / M);
+    }
+    S = sqrt(S / (k - 1));
+
+    PL_INFO(p, "Detected profile approximation gamma %.3f", M);
+    if (!(M > 0)) {
+        // Checked first: also catches NaN and the "no usable ramp sample" case
+        PL_ERR(p, "Arithmetic error in ICC profile gamma estimation? "
+               "Please open an issue");
+        return false;
+    } else if (S > 0.5) {
+        PL_WARN(p, "Detected profile gamma (%.3f) very far from pure power "
+                "response (stddev=%.1f), suspected unusual or broken profile. "
+                "Using anyway, but results may be poor.", M, S);
+    }
+
+    *out_gamma = M;
+    p->gamma_stddev = S;
+    return true;
+}
+
+// Determine the display's luminance range. The black point is detected via
+// lcms2 (and kept in the private state); the peak is `max_luma` if positive,
+// else the profile's luminance tag, else PL_COLOR_SDR_WHITE. May switch
+// `params->intent` to perceptual for v4 profiles. Returns false on failure.
+static bool detect_contrast(pl_icc_object icc, struct pl_hdr_metadata *hdr,
+                            struct pl_icc_params *params, float max_luma)
+{
+    struct icc_priv *p = PL_PRIV(icc);
+    cmsCIEXYZ *white = cmsReadTag(p->profile, cmsSigLuminanceTag);
+    enum pl_rendering_intent intent = params->intent;
+    // LittleCMS refuses to detect a black point in absolute colorimetric
+    // intent; relative colorimetric is fine as we only need the brightness
+    if (intent == PL_INTENT_ABSOLUTE_COLORIMETRIC)
+        intent = PL_INTENT_RELATIVE_COLORIMETRIC;
+    if (!cmsDetectDestinationBlackPoint(&p->black, p->profile, intent, 0)) {
+        // v4 ICC profiles only have a black point tag for perceptual/saturation
+        // intents, so retry once with the rendering intent set to perceptual
+        if (cmsGetEncodedICCversion(p->profile) >= 0x4000000 && intent != PL_INTENT_PERCEPTUAL) {
+            params->intent = PL_INTENT_PERCEPTUAL;
+            return detect_contrast(icc, hdr, params, max_luma);
+        }
+        PL_ERR(p, "Failed detecting ICC profile black point!");
+        return false;
+    }
+
+    if (white) {
+        PL_DEBUG(p, "Detected raw white point X=%.2f Y=%.2f Z=%.2f cd/m^2",
+                 white->X, white->Y, white->Z);
+    }
+    PL_DEBUG(p, "Detected raw black point X=%.6f%% Y=%.6f%% Z=%.6f%%",
+             p->black.X * 100, p->black.Y * 100, p->black.Z * 100);
+
+    // A luminance tag whose Y is zero, negative or NaN is unusable as a peak
+    if (max_luma <= 0)
+        max_luma = (white && white->Y > 0) ? white->Y : PL_COLOR_SDR_WHITE;
+
+    hdr->max_luma = max_luma;
+    hdr->min_luma = p->black.Y * max_luma;
+    hdr->min_luma = PL_MAX(hdr->min_luma, 1e-6); // prevent true 0
+    PL_INFO(p, "Using ICC contrast %.0f:1", hdr->max_luma / hdr->min_luma);
+    return true;
+}
+
+static void infer_clut_size(struct pl_icc_object_t *icc) // chooses params->size_{r,g,b}
+{
+    struct icc_priv *p = PL_PRIV(icc);
+    struct pl_icc_params *params = &icc->params;
+    if (params->size_r && params->size_g && params->size_b) { // all three given explicitly
+        PL_DEBUG(p, "Using fixed 3DLUT size: %dx%dx%d",
+                 (int) params->size_r, (int) params->size_g, (int) params->size_b);
+        return;
+    }
+    // REQUIRE_SIZE(N): raise each LUT dimension to at least N (never shrinks)
+#define REQUIRE_SIZE(N) \
+    params->size_r = PL_MAX(params->size_r, N); \
+    params->size_g = PL_MAX(params->size_g, N); \
+    params->size_b = PL_MAX(params->size_b, N)
+
+    // Default size for sanity
+    REQUIRE_SIZE(9);
+
+    // Ensure enough precision to track the (absolute) black point
+    if (p->black.Y > 1e-4) {
+        float black_rel = powf(p->black.Y, 1.0f / icc->gamma); // black level in the encoded (gamma) domain
+        int min_size = 2 * (int) ceilf(1.0f / black_rel); // grid spacing of about black_rel / 2
+        REQUIRE_SIZE(min_size);
+    }
+
+    // Ensure enough precision to track the gamma curve (the larger the spread
+    if (p->gamma_stddev > 1e-2) { // of the gamma estimate, the denser the LUT)
+        REQUIRE_SIZE(65);
+    } else if (p->gamma_stddev > 1e-3) {
+        REQUIRE_SIZE(33);
+    } else if (p->gamma_stddev > 1e-4) {
+        REQUIRE_SIZE(17);
+    }
+
+    // Ensure enough precision to track any internal CLUTs
+    cmsPipeline *pipe = NULL;
+    switch (icc->params.intent) { // start at the intent's own BToA tag
+    case PL_INTENT_SATURATION:
+        pipe = cmsReadTag(p->profile, cmsSigBToA2Tag);
+        if (pipe)
+            break;
+        // fall through
+    case PL_INTENT_RELATIVE_COLORIMETRIC:
+    case PL_INTENT_ABSOLUTE_COLORIMETRIC:
+    default:
+        pipe = cmsReadTag(p->profile, cmsSigBToA1Tag);
+        if (pipe)
+            break;
+        // fall through
+    case PL_INTENT_PERCEPTUAL:
+        pipe = cmsReadTag(p->profile, cmsSigBToA0Tag);
+        break;
+    }
+
+    if (!pipe) { // no BToA tag found: repeat the same search over the AToB tags
+        switch (icc->params.intent) {
+        case PL_INTENT_SATURATION:
+            pipe = cmsReadTag(p->profile, cmsSigAToB2Tag);
+            if (pipe)
+                break;
+            // fall through
+        case PL_INTENT_RELATIVE_COLORIMETRIC:
+        case PL_INTENT_ABSOLUTE_COLORIMETRIC:
+        default:
+            pipe = cmsReadTag(p->profile, cmsSigAToB1Tag);
+            if (pipe)
+                break;
+            // fall through
+        case PL_INTENT_PERCEPTUAL:
+            pipe = cmsReadTag(p->profile, cmsSigAToB0Tag);
+            break;
+        }
+    }
+
+    if (pipe) { // match the sample counts of any 3-input CLUT stage in the pipeline
+        for (cmsStage *stage = cmsPipelineGetPtrToFirstStage(pipe);
+             stage; stage = cmsStageNext(stage))
+        {
+            switch (cmsStageType(stage)) {
+            case cmsSigCLutElemType: ; // empty statement so a declaration may follow the label
+                _cmsStageCLutData *data = cmsStageData(stage);
+                if (data->Params->nInputs != 3)
+                    continue; // `continue` targets the enclosing stage loop
+                params->size_r = PL_MAX(params->size_r, data->Params->nSamples[0]);
+                params->size_g = PL_MAX(params->size_g, data->Params->nSamples[1]);
+                params->size_b = PL_MAX(params->size_b, data->Params->nSamples[2]);
+                break;
+
+            default:
+                continue;
+            }
+        }
+    }
+
+    // Clamp the output size to make sure profiles are not too large
+    params->size_r = PL_MIN(params->size_r, 129);
+    params->size_g = PL_MIN(params->size_g, 129);
+    params->size_b = PL_MIN(params->size_b, 129);
+
+    // Constrain the total LUT size to roughly 1M entries
+    const size_t max_size = 1000000;
+    size_t total_size = params->size_r * params->size_g * params->size_b;
+    if (total_size > max_size) {
+        float factor = powf((float) max_size / total_size, 1/3.0f); // uniform per-axis shrink factor
+        params->size_r = ceilf(factor * params->size_r); // rounding up may leave the total slightly above max_size
+        params->size_g = ceilf(factor * params->size_g);
+        params->size_b = ceilf(factor * params->size_b);
+    }
+
+    PL_INFO(p, "Chosen 3DLUT size: %dx%dx%d",
+            (int) params->size_r, (int) params->size_g, (int) params->size_b);
+}
+
+static bool icc_init(struct pl_icc_object_t *icc) // derives all profile-dependent state; false on failure
+{
+    struct icc_priv *p = PL_PRIV(icc);
+    struct pl_icc_params *params = &icc->params;
+    if (params->intent < 0 || params->intent > PL_INTENT_ABSOLUTE_COLORIMETRIC)
+        params->intent = cmsGetHeaderRenderingIntent(p->profile); // out of range: use the profile header's intent
+
+    struct pl_raw_primaries *out_prim = &icc->csp.hdr.prim;
+    if (!detect_csp(icc, out_prim, &icc->gamma))
+        return false;
+    if (!detect_contrast(icc, &icc->csp.hdr, params, params->max_luma))
+        return false;
+    infer_clut_size(icc);
+
+    const struct pl_raw_primaries *best = NULL; // primaries used for the approximation profile
+    for (enum pl_color_primaries prim = 1; prim < PL_COLOR_PRIM_COUNT; prim++) {
+        const struct pl_raw_primaries *raw = pl_raw_primaries_get(prim);
+        if (!icc->csp.primaries && pl_raw_primaries_similar(raw, out_prim)) { // direct match: stop searching
+            icc->containing_primaries = prim;
+            icc->csp.primaries = prim;
+            best = raw;
+            break;
+        }
+
+        if (pl_primaries_superset(raw, out_prim) && // candidate contains the profile's gamut ...
+            (!best || pl_primaries_superset(best, raw))) // ... and is contained in the current best
+        {
+            icc->containing_primaries = prim;
+            best = raw;
+        }
+    }
+
+    if (!best) { // no known primaries contain the profile: fall back to ACES AP0
+        PL_WARN(p, "ICC profile too wide to handle, colors may be clipped!");
+        icc->containing_primaries = PL_COLOR_PRIM_ACES_AP0;
+        best = pl_raw_primaries_get(icc->containing_primaries);
+    }
+
+    // Create approximation profile. Use a tone-curve based on a BT.1886-style
+    // pure power curve, with an approximation gamma matched to the ICC
+    // profile. We stretch the luminance range *before* the input to the gamma
+    // function, to avoid numerical issues near the black point. (This removes
+    // the need for a separate linear section)
+    //
+    // Y = scale * (aX + b)^y, where Y = PCS luma and X = encoded value ([0-1])
+    p->scale = pl_hdr_rescale(PL_HDR_NITS, PL_HDR_NORM, icc->csp.hdr.max_luma);
+    p->b = powf(icc->csp.hdr.min_luma / icc->csp.hdr.max_luma, 1.0f / icc->gamma);
+    p->a = (1 - p->b); // so that aX + b reaches 1 at X = 1
+    cmsToneCurve *curve = cmsBuildParametricToneCurve(p->cms, 2, // parameters are { gamma, a, b }
+            (double[3]) { icc->gamma, p->a, p->b });
+    if (!curve)
+        return false;
+
+    cmsCIExyY wp_xyY = { best->white.x, best->white.y, 1.0 };
+    cmsCIExyYTRIPLE prim_xyY = {
+        .Red   = { best->red.x,   best->red.y,   1.0 },
+        .Green = { best->green.x, best->green.y, 1.0 },
+        .Blue  = { best->blue.x,  best->blue.y,  1.0 },
+    };
+
+    p->approx = cmsCreateRGBProfileTHR(p->cms, &wp_xyY, &prim_xyY,
+                        (cmsToneCurve *[3]){ curve, curve, curve }); // same curve for R, G and B
+    cmsFreeToneCurve(curve);
+    if (!p->approx)
+        return false;
+
+    // We need to create an ICC V2 profile because ICC V4 perceptual profiles
+    // have normalized semantics, but we want colorimetric mapping with BPC
+    cmsSetHeaderRenderingIntent(p->approx, icc->params.intent);
+    cmsSetProfileVersion(p->approx, 2.2);
+
+    // Hash all parameters affecting the generated 3DLUT
+    p->lut_sig = CACHE_KEY_ICC_3DLUT;
+    pl_hash_merge(&p->lut_sig, icc->signature);
+    pl_hash_merge(&p->lut_sig, params->intent);
+    pl_hash_merge(&p->lut_sig, params->size_r);
+    pl_hash_merge(&p->lut_sig, params->size_g);
+    pl_hash_merge(&p->lut_sig, params->size_b);
+    pl_hash_merge(&p->lut_sig, params->force_bpc);
+    union { double d; uint64_t u; } v = { .d = icc->csp.hdr.max_luma }; // hash the value's bit pattern
+    pl_hash_merge(&p->lut_sig, v.u);
+    // min luma depends only on the max luma and profile
+
+    // Backwards compatibility with old caching API
+    if ((params->cache_save || params->cache_load) && !params->cache) {
+        p->cache = pl_cache_create(pl_cache_params(
+            .log  = p->log,
+            .set  = params->cache_save ? set_callback : NULL,
+            .get  = params->cache_load ? get_callback : NULL,
+            .priv = icc, // handed back to set_callback / get_callback as `priv`
+        ));
+    }
+
+    return true;
+}
+
+// Open an ICC profile from memory and derive all cached state via icc_init().
+// Returns NULL for an empty profile, a non-RGB profile, or any other failure.
+pl_icc_object pl_icc_open(pl_log log, const struct pl_icc_profile *profile,
+                          const struct pl_icc_params *params)
+{
+    if (profile->len == 0)
+        return NULL;
+
+    struct pl_icc_object_t *obj = pl_zalloc_obj(NULL, obj, struct icc_priv);
+    struct icc_priv *priv = PL_PRIV(obj);
+    obj->signature = profile->signature;
+    obj->params = params ? *params : pl_icc_default_params;
+    priv->log = log;
+    priv->cms = cmsCreateContext(NULL, (void *) log);
+    if (!priv->cms) {
+        PL_ERR(priv, "Failed creating LittleCMS context!");
+        goto error;
+    }
+
+    cmsSetLogErrorHandlerTHR(priv->cms, error_callback);
+    PL_INFO(priv, "Opening ICC profile..");
+    priv->profile = cmsOpenProfileFromMemTHR(priv->cms, profile->data, profile->len);
+    if (!priv->profile) {
+        PL_ERR(priv, "Failed opening ICC profile");
+        goto error;
+    }
+
+    if (cmsGetColorSpace(priv->profile) != cmsSigRgbData) {
+        PL_ERR(priv, "Invalid ICC profile: not RGB");
+        goto error;
+    }
+
+    if (icc_init(obj))
+        return obj;
+
+error:
+    pl_icc_close((pl_icc_object *) &obj);
+    return NULL;
+}
+
+// Reset an existing object to a fresh state using new `params`, keeping only
+// the log, lcms2 context and opened profile, then rerun icc_init().
+static bool icc_reopen(pl_icc_object kicc, const struct pl_icc_params *params)
+{
+    struct pl_icc_object_t *icc = (struct pl_icc_object_t *) kicc;
+    struct icc_priv *p = PL_PRIV(icc);
+    cmsCloseProfile(p->approx);
+    pl_cache_destroy(&p->cache);
+
+    const uint64_t signature = icc->signature;
+    *icc = (struct pl_icc_object_t) {
+        .params    = *params,
+        .signature = signature,
+    };
+
+    const struct icc_priv kept = { .log = p->log, .cms = p->cms, .profile = p->profile };
+    *p = kept;
+
+    PL_DEBUG(p, "Reinitializing ICC profile in-place");
+    return icc_init(icc);
+}
+
+// Bring `*out_icc` in line with `profile`/`params`: opens a new object when
+// the signature changed, reinitializes in place when only parameters changed.
+bool pl_icc_update(pl_log log, pl_icc_object *out_icc,
+                   const struct pl_icc_profile *profile,
+                   const struct pl_icc_params *params)
+{
+    pl_icc_object cur = *out_icc;
+    if (!cur && !profile)
+        return false; // nothing to update
+    if (!params)
+        params = &pl_icc_default_params;
+
+    const uint64_t want_sig = profile ? profile->signature : cur->signature;
+    if (!cur || cur->signature != want_sig) {
+        pl_assert(profile);
+        pl_icc_close(&cur);
+        cur = pl_icc_open(log, profile, params);
+        *out_icc = cur;
+        return cur != NULL;
+    }
+
+    // Unset (zero) sizes in `params` count as matching the current sizes
+    const struct pl_icc_params *old = &cur->params;
+    const bool same_size = PL_DEF(params->size_r, old->size_r) == old->size_r &&
+                           PL_DEF(params->size_g, old->size_g) == old->size_g &&
+                           PL_DEF(params->size_b, old->size_b) == old->size_b;
+    if (same_size && params->intent == old->intent &&
+        params->max_luma == old->max_luma && params->force_bpc == old->force_bpc)
+        return true;
+
+    // ICC signature is the same but parameters are different, re-open in-place
+    if (icc_reopen(cur, params))
+        return true;
+
+    pl_icc_close(&cur);
+    *out_icc = NULL;
+    return false;
+}
+
+// Fill a 3DLUT with RGBA16 samples of the transform between the ICC profile
+// and its approximation profile (`decode`: profile -> approx, else reverse).
+// Texels are laid out with red varying fastest, then green, then blue.
+static void fill_lut(void *datap, const struct sh_lut_params *params, bool decode)
+{
+    pl_icc_object icc = params->priv;
+    struct icc_priv *p = PL_PRIV(icc);
+    cmsHPROFILE srcp = decode ? p->profile : p->approx;
+    cmsHPROFILE dstp = decode ? p->approx  : p->profile;
+    int s_r = params->width, s_g = params->height, s_b = params->depth;
+
+    pl_clock_t start = pl_clock_now();
+    cmsHTRANSFORM tf = cmsCreateTransformTHR(p->cms, srcp, TYPE_RGB_16,
+                                             dstp, TYPE_RGBA_16, icc->params.intent,
+                                             cmsFLAGS_BLACKPOINTCOMPENSATION |
+                                             cmsFLAGS_NOCACHE | cmsFLAGS_NOOPTIMIZE);
+    if (!tf)
+        return;
+
+    pl_clock_t after_transform = pl_clock_now();
+    pl_log_cpu_time(p->log, start, after_transform, "creating ICC transform");
+
+    uint16_t *tmp = pl_alloc(NULL, s_r * 3 * sizeof(tmp[0]));
+    for (int b = 0; b < s_b; b++) {
+        for (int g = 0; g < s_g; g++) {
+            // Transform a single line of the output buffer
+            for (int r = 0; r < s_r; r++) {
+                tmp[r * 3 + 0] = r * 65535 / (s_r - 1);
+                tmp[r * 3 + 1] = g * 65535 / (s_g - 1);
+                tmp[r * 3 + 2] = b * 65535 / (s_b - 1);
+            }
+
+            size_t offset = (b * s_g + g) * s_r * 4;
+            uint16_t *data = ((uint16_t *) datap) + offset;
+            cmsDoTransform(tf, tmp, data, s_r);
+
+            // Fix the black point manually. Work-around for "improper"
+            // profiles, as black point compensation should already have
+            // taken care of this normally.
+            const uint16_t knee = 16u << 8;
+            if (!icc->params.force_bpc || tmp[0] >= knee || tmp[1] >= knee)
+                continue;
+            for (int r = 0; r < s_r; r++) {
+                uint16_t s = (2 * tmp[1] + tmp[2] + tmp[r * 3]) >> 2;
+                if (s >= knee)
+                    break;
+                // Output texels are RGBA (stride 4), unlike the RGB input row
+                for (int c = 0; c < 3; c++)
+                    data[r * 4 + c] = (s * data[r * 4 + c] + (knee - s) * s) >> 12;
+            }
+        }
+    }
+
+    pl_log_cpu_time(p->log, after_transform, pl_clock_now(), "generating ICC 3DLUT");
+    cmsDeleteTransform(tf);
+    pl_free(tmp);
+}
+
+static void fill_decode(void *datap, const struct sh_lut_params *params) // LUT fill callback used by pl_icc_decode
+{
+    fill_lut(datap, params, true); // decode: ICC profile -> approximation profile
+}
+
+static void fill_encode(void *datap, const struct sh_lut_params *params) // LUT fill callback used by pl_icc_encode
+{
+    fill_lut(datap, params, false); // encode: approximation profile -> ICC profile
+}
+
+static pl_cache get_cache(pl_icc_object icc, pl_shader sh)
+{
+    const struct icc_priv *priv = PL_PRIV(icc); // priv->cache wraps the legacy callbacks
+    return icc->params.cache ? icc->params.cache : PL_DEF(priv->cache, SH_CACHE(sh));
+}
+
+void pl_icc_decode(pl_shader sh, pl_icc_object icc, pl_shader_obj *lut_obj,
+                   struct pl_color_space *out_csp) // out_csp is optional (may be NULL)
+{
+    struct icc_priv *p = PL_PRIV(icc);
+    if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+        return;
+    // 4-component texture format, matching the TYPE_RGBA_16 texels written by fill_lut
+    pl_fmt fmt = pl_find_fmt(SH_GPU(sh), PL_FMT_UNORM, 4, 16, 16, PL_FMT_CAP_LINEAR);
+    if (!fmt) {
+        SH_FAIL(sh, "Failed finding ICC 3DLUT texture format!");
+        return;
+    }
+
+    ident_t lut = sh_lut(sh, sh_lut_params(
+        .object     = lut_obj,
+        .var_type   = PL_VAR_FLOAT,
+        .method     = SH_LUT_TETRAHEDRAL,
+        .fmt        = fmt,
+        .width      = icc->params.size_r,
+        .height     = icc->params.size_g,
+        .depth      = icc->params.size_b,
+        .comps      = 4,
+        .signature  = p->lut_sig,
+        .fill       = fill_decode,
+        .cache      = get_cache(icc, sh),
+        .priv       = (void *) icc, // read back by fill_lut as params->priv
+    ));
+
+    if (!lut) {
+        SH_FAIL(sh, "pl_icc_decode: failed generating LUT object");
+        return;
+    }
+
+    // Apply the LUT, then undo the approximation curve: Y = scale * (aX + b)^y
+    sh_describe(sh, "ICC 3DLUT");
+    GLSL("// pl_icc_decode                          \n"
+         "{                                         \n"
+         "color.rgb = "$"(color.rgb).rgb;           \n"
+         "color.rgb = "$" * color.rgb + vec3("$");  \n"
+         "color.rgb = pow(color.rgb, vec3("$"));    \n"
+         "color.rgb = "$" * color.rgb;              \n"
+         "}                                         \n",
+         lut,
+         SH_FLOAT(p->a), SH_FLOAT(p->b),
+         SH_FLOAT(icc->gamma),
+         SH_FLOAT(p->scale));
+
+    if (out_csp) { // report the color space of the shader output: linear light
+        *out_csp = (struct pl_color_space) {
+            .primaries  = icc->containing_primaries,
+            .transfer   = PL_COLOR_TRC_LINEAR,
+            .hdr        = icc->csp.hdr,
+        };
+    }
+}
+
+void pl_icc_encode(pl_shader sh, pl_icc_object icc, pl_shader_obj *lut_obj) // inverse of pl_icc_decode
+{
+    struct icc_priv *p = PL_PRIV(icc);
+    if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+        return;
+    // 4-component texture format, matching the TYPE_RGBA_16 texels written by fill_lut
+    pl_fmt fmt = pl_find_fmt(SH_GPU(sh), PL_FMT_UNORM, 4, 16, 16, PL_FMT_CAP_LINEAR);
+    if (!fmt) {
+        SH_FAIL(sh, "Failed finding ICC 3DLUT texture format!");
+        return;
+    }
+
+    ident_t lut = sh_lut(sh, sh_lut_params(
+        .object     = lut_obj,
+        .var_type   = PL_VAR_FLOAT,
+        .method     = SH_LUT_TETRAHEDRAL,
+        .fmt        = fmt,
+        .width      = icc->params.size_r,
+        .height     = icc->params.size_g,
+        .depth      = icc->params.size_b,
+        .comps      = 4,
+        .signature  = ~p->lut_sig, // avoid confusion with decoding LUTs
+        .fill       = fill_encode,
+        .cache      = get_cache(icc, sh),
+        .priv       = (void *) icc, // read back by fill_lut as params->priv
+    ));
+
+    if (!lut) {
+        SH_FAIL(sh, "pl_icc_encode: failed generating LUT object");
+        return;
+    }
+
+    // Apply the approximation curve, then the LUT: X = 1/a * (Y/scale)^(1/y) - b/a
+    sh_describe(sh, "ICC 3DLUT");
+    GLSL("// pl_icc_encode                          \n"
+         "{                                         \n"
+         "color.rgb = max(color.rgb, 0.0);          \n"
+         "color.rgb = 1.0/"$" * color.rgb;          \n"
+         "color.rgb = pow(color.rgb, vec3("$"));    \n"
+         "color.rgb = 1.0/"$" * color.rgb - "$";    \n"
+         "color.rgb = "$"(color.rgb).rgb;           \n"
+         "}                                         \n",
+         SH_FLOAT(p->scale),
+         SH_FLOAT(1.0f / icc->gamma),
+         SH_FLOAT(p->a), SH_FLOAT(p->b / p->a),
+         lut);
+}
+
+#else // !PL_HAVE_LCMS
+
+void pl_icc_close(pl_icc_object *picc) {} // no-op stub: pl_icc_open always fails without LittleCMS
+pl_icc_object pl_icc_open(pl_log log, const struct pl_icc_profile *profile,
+                          const struct pl_icc_params *pparams) // stub for builds without PL_HAVE_LCMS
+{
+    pl_err(log, "libplacebo compiled without LittleCMS 2 support!");
+    return NULL; // always fails: ICC support is compiled out
+}
+
+bool pl_icc_update(pl_log log, pl_icc_object *obj,
+                   const struct pl_icc_profile *profile,
+                   const struct pl_icc_params *params)
+{
+    static bool warned = false; // the error is logged on the first call only
+    *obj = NULL;
+    if (warned)
+        return false;
+    warned = true;
+    pl_err(log, "libplacebo compiled without LittleCMS 2 support!");
+    return false;
+}
+
+void pl_icc_decode(pl_shader sh, pl_icc_object icc, pl_shader_obj *lut_obj,
+                   struct pl_color_space *out_csp) // stub for builds without PL_HAVE_LCMS
+{
+    pl_unreachable(); // can't get a pl_icc_object: the stub pl_icc_open always returns NULL
+}
+
+void pl_icc_encode(pl_shader sh, pl_icc_object icc, pl_shader_obj *lut_obj) // stub for builds without PL_HAVE_LCMS
+{
+    pl_unreachable(); // no pl_icc_object can exist: the stub pl_icc_open always returns NULL
+}
+
+#endif
diff --git a/src/shaders/lut.c b/src/shaders/lut.c
new file mode 100644
index 0000000..b0124fc
--- /dev/null
+++ b/src/shaders/lut.c
@@ -0,0 +1,820 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <math.h>
+#include <ctype.h>
+
+#include "shaders.h"
+
+#include <libplacebo/shaders/lut.h>
+
+static inline bool isnumeric(char c) // ASCII digit or a leading minus sign
+{
+    return c == '-' || ('0' <= c && c <= '9');
+}
+
+void pl_lut_free(struct pl_custom_lut **lut)
+{
+    pl_free_ptr(lut); // helper defined elsewhere; presumably also NULLs `*lut`
+}
+
+struct pl_custom_lut *pl_lut_parse_cube(pl_log log, const char *cstr, size_t cstr_len)
+{ // Parses a .cube LUT; returns NULL (after logging the reason) on failure
+    struct pl_custom_lut *lut = pl_zalloc_ptr(NULL, lut);
+    pl_str str = (pl_str) { (uint8_t *) cstr, cstr_len };
+    lut->signature = pl_str_hash(str); // hash of the complete input text
+    int entries = 0;
+    // Input domain, defaults to [0,1] unless DOMAIN_MIN/DOMAIN_MAX are given
+    float min[3] = { 0.0, 0.0, 0.0 };
+    float max[3] = { 1.0, 1.0, 1.0 };
+
+    // Parse header: keyword lines up to the first line starting with a number
+    while (str.len && !isnumeric(str.buf[0])) {
+        pl_str line = pl_str_strip(pl_str_getline(str, &str));
+        if (!line.len)
+            continue; // skip empty line
+
+        if (pl_str_eatstart0(&line, "TITLE")) {
+            pl_info(log, "Loading LUT: %.*s", PL_STR_FMT(pl_str_strip(line)));
+            continue;
+        }
+
+        if (pl_str_eatstart0(&line, "LUT_3D_SIZE")) {
+            line = pl_str_strip(line);
+            int size;
+            if (!pl_str_parse_int(line, &size)) {
+                pl_err(log, "Failed parsing dimension '%.*s'", PL_STR_FMT(line));
+                goto error;
+            }
+            if (size <= 0 || size > 1024) {
+                pl_err(log, "Invalid 3DLUT size: %dx%dx%d", size, size, size);
+                goto error;
+            }
+
+            lut->size[0] = lut->size[1] = lut->size[2] = size;
+            entries = size * size * size;
+            continue;
+        }
+
+        if (pl_str_eatstart0(&line, "LUT_1D_SIZE")) {
+            line = pl_str_strip(line);
+            int size;
+            if (!pl_str_parse_int(line, &size)) {
+                pl_err(log, "Failed parsing dimension '%.*s'", PL_STR_FMT(line));
+                goto error;
+            }
+            if (size <= 0 || size > 65536) {
+                pl_err(log, "Invalid 1DLUT size: %d", size);
+                goto error;
+            }
+
+            lut->size[0] = size;
+            lut->size[1] = lut->size[2] = 0; // zero height/depth marks a 1D LUT
+            entries = size;
+            continue;
+        }
+
+        if (pl_str_eatstart0(&line, "DOMAIN_MIN")) {
+            line = pl_str_strip(line);
+            if (!pl_str_parse_float(pl_str_split_char(line, ' ', &line), &min[0]) ||
+                !pl_str_parse_float(pl_str_split_char(line, ' ', &line), &min[1]) ||
+                !pl_str_parse_float(line, &min[2]))
+            {
+                pl_err(log, "Failed parsing domain: '%.*s'", PL_STR_FMT(line));
+                goto error;
+            }
+            continue;
+        }
+
+        if (pl_str_eatstart0(&line, "DOMAIN_MAX")) {
+            line = pl_str_strip(line);
+            if (!pl_str_parse_float(pl_str_split_char(line, ' ', &line), &max[0]) ||
+                !pl_str_parse_float(pl_str_split_char(line, ' ', &line), &max[1]) ||
+                !pl_str_parse_float(line, &max[2]))
+            {
+                pl_err(log, "Failed parsing domain: '%.*s'", PL_STR_FMT(line));
+                goto error;
+            }
+            continue;
+        }
+
+        if (pl_str_eatstart0(&line, "#")) {
+            pl_debug(log, "Unhandled .cube comment: %.*s",
+                     PL_STR_FMT(pl_str_strip(line)));
+            continue;
+        }
+
+        pl_warn(log, "Unhandled .cube line: %.*s", PL_STR_FMT(pl_str_strip(line)));
+    }
+
+    if (!entries) {
+        pl_err(log, "Missing LUT size specification?");
+        goto error;
+    }
+
+    for (int i = 0; i < 3; i++) { // reject empty or inverted domains
+        if (max[i] - min[i] < 1e-6) {
+            pl_err(log, "Invalid domain range: [%f, %f]", min[i], max[i]);
+            goto error;
+        }
+    }
+
+    float *data = pl_alloc(lut, sizeof(float[3]) * entries); // child of `lut`
+    lut->data = data;
+
+    // Parse LUT body: `entries` RGB triplets separated by whitespace
+    pl_clock_t start = pl_clock_now();
+    for (int n = 0; n < entries; n++) {
+        for (int c = 0; c < 3; c++) {
+            static const char * const digits = "0123456789.-+e";
+
+            // Extract valid digit sequence
+            size_t len = pl_strspn(str, digits);
+            pl_str entry = (pl_str) { str.buf, len };
+            str.buf += len;
+            str.len -= len;
+
+            if (!entry.len) {
+                if (!str.len) {
+                    pl_err(log, "Failed parsing LUT: Unexpected EOF, expected "
+                           "%d entries, got %d", entries * 3, n * 3 + c);
+                } else {
+                    pl_err(log, "Failed parsing LUT: Unexpected '%c', expected "
+                           "digit", str.buf[0]);
+                }
+                goto error;
+            }
+
+            float num;
+            if (!pl_str_parse_float(entry, &num)) {
+                pl_err(log, "Failed parsing float value '%.*s'", PL_STR_FMT(entry));
+                goto error;
+            }
+
+            // Rescale from the declared domain to the range 0.0 - 1.0
+            *data++ = (num - min[c]) / (max[c] - min[c]);
+
+            // Skip whitespace between digits
+            str = pl_str_strip(str);
+        }
+    }
+
+    str = pl_str_strip(str);
+    if (str.len)
+        pl_warn(log, "Extra data after LUT?... ignoring '%c'", str.buf[0]);
+
+    pl_log_cpu_time(log, start, pl_clock_now(), "parsing .cube LUT");
+    return lut;
+
+error:
+    pl_free(lut);
+    return NULL;
+}
+
+static void fill_lut(void *datap, const struct sh_lut_params *params)
+{
+    // Expands the packed RGB triplets of the custom LUT (passed via `priv`)
+    // into RGBA texels, walking both buffers in the same linear order
+    const struct pl_custom_lut *custom = params->priv;
+    const float *in = custom->data;
+    float *out = datap;
+
+    // A height/depth of zero counts as a single row/slice
+    const int num_r = params->width;
+    const int num_g = PL_DEF(params->height, 1);
+    const int num_b = PL_DEF(params->depth, 1);
+    const int texels = num_b * num_g * num_r;
+
+    for (int i = 0; i < texels; i++) {
+        out[0] = in[0];
+        out[1] = in[1];
+        out[2] = in[2];
+        out[3] = 0.0f; // unused fourth component
+        in += 3;
+        out += 4;
+    }
+}
+
+void pl_shader_custom_lut(pl_shader sh, const struct pl_custom_lut *lut,
+                          pl_shader_obj *lut_state)
+{
+    // Applies `lut` to the current color, surrounded by the optional shaper
+    // matrices. A NULL `lut` is a no-op; unsupported sizes fail the shader.
+    if (!lut)
+        return;
+
+    const bool has_w = lut->size[0] > 0;
+    const bool is_3d = has_w && lut->size[1] > 0 && lut->size[2] > 0;
+    const bool is_1d = has_w && !lut->size[1] && !lut->size[2];
+    if (!is_3d && !is_1d) {
+        SH_FAIL(sh, "Invalid dimensions %dx%dx%d for pl_custom_lut, must be 1D "
+                "or 3D!", lut->size[0], lut->size[1], lut->size[2]);
+        return;
+    }
+
+    if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+        return;
+
+    ident_t lookup = sh_lut(sh, sh_lut_params(
+        .object     = lut_state,
+        .var_type   = PL_VAR_FLOAT,
+        .method     = SH_LUT_TETRAHEDRAL,
+        .width      = lut->size[0],
+        .height     = lut->size[1],
+        .depth      = lut->size[2],
+        .comps      = 4, // for better texel alignment
+        .signature  = lut->signature,
+        .fill       = fill_lut,
+        .priv       = (void *) lut,
+    ));
+
+    if (!lookup) {
+        SH_FAIL(sh, "pl_shader_custom_lut: failed generating LUT object");
+        return;
+    }
+
+    GLSL("// pl_shader_custom_lut \n");
+
+    // An all-zero shaper matrix is treated as "not set" and skipped
+    static const pl_matrix3x3 unset = {0};
+    if (memcmp(&lut->shaper_in, &unset, sizeof(unset)) != 0) {
+        ident_t pre = sh_var(sh, (struct pl_shader_var) {
+            .var = pl_var_mat3("shaper_in"),
+            .data = PL_TRANSPOSE_3X3(lut->shaper_in.m),
+        });
+        GLSL("color.rgb = "$" * color.rgb; \n", pre);
+    }
+
+    if (is_1d) {
+        sh_describe(sh, "custom 1DLUT");
+        GLSL("color.rgb = vec3("$"(color.r).r,  \n"
+             "                 "$"(color.g).g,  \n"
+             "                 "$"(color.b).b); \n",
+             lookup, lookup, lookup);
+    } else {
+        sh_describe(sh, "custom 3DLUT");
+        GLSL("color.rgb = "$"(color.rgb).rgb; \n", lookup);
+    }
+
+    if (memcmp(&lut->shaper_out, &unset, sizeof(unset)) != 0) {
+        ident_t post = sh_var(sh, (struct pl_shader_var) {
+            .var = pl_var_mat3("shaper_out"),
+            .data = PL_TRANSPOSE_3X3(lut->shaper_out.m),
+        });
+        GLSL("color.rgb = "$" * color.rgb; \n", post);
+    }
+}
+
+// Emits a helper macro that maps a LUT position onto texture coordinates that
+// hit texel centers, for a texture axis of `lut_size` texels. The position is
+// given in [0,1] if `normalized`, and in units of texels otherwise.
+static ident_t texel_scale(pl_shader sh, int lut_size, bool normalized)
+{
+    const float first = 0.5f / lut_size;        // center of the first texel
+    const float last = 1.0f - 0.5f / lut_size;  // center of the last texel
+    const float range = normalized ? 1.0f : (lut_size - 1);
+    const float slope = (last - first) / range;
+    ident_t macro = sh_fresh(sh, "LUT_SCALE");
+    GLSLH("#define "$"(x) ("$" * (x) + "$") \n",
+          macro, SH_FLOAT(slope), SH_FLOAT(first));
+    return macro;
+}
+
+struct sh_lut_obj { // state kept between sh_lut calls, to detect changes
+    enum sh_lut_type type;      // storage actually chosen (never SH_LUT_AUTO)
+    enum sh_lut_method method;  // method after any fallback to linear
+    enum pl_var_type vartype;
+    pl_fmt fmt;                 // format requested via params->fmt, may be NULL
+    int width, height, depth, comps;
+    uint64_t signature;
+    bool error; // set on failure; suppresses retries until the params change
+
+    // weights, depending on the lut type
+    pl_tex tex;  // SH_LUT_TEXTURE: texture holding the values
+    pl_str str;  // SH_LUT_LITERAL: text of the GLSL array initializer
+    void *data;  // SH_LUT_UNIFORM: copy of the raw values
+};
+
+static void sh_lut_uninit(pl_gpu gpu, void *ptr)
+{
+    // Releases everything owned by the LUT object, then resets it to zero
+    struct sh_lut_obj *obj = ptr;
+    pl_free(obj->data);
+    pl_free(obj->str.buf);
+    pl_tex_destroy(gpu, &obj->tex);
+    *obj = (struct sh_lut_obj) {0};
+}
+
+// Maximum LUT entries to embed as a literal array (soft: SH_LUT_AUTO only)
+#define SH_LUT_MAX_LITERAL_SOFT 64
+#define SH_LUT_MAX_LITERAL_HARD 256
+
+ident_t sh_lut(pl_shader sh, const struct sh_lut_params *params)
+{ // Returns the name of a GLSL helper that samples the LUT, or NULL_IDENT
+    pl_gpu gpu = SH_GPU(sh);
+    pl_cache_obj obj = { .key = CACHE_KEY_SH_LUT ^ params->signature };
+    // Sanity-check the parameters before touching any state
+    const enum pl_var_type vartype = params->var_type;
+    pl_assert(vartype != PL_VAR_INVALID);
+    pl_assert(params->method == SH_LUT_NONE || vartype == PL_VAR_FLOAT);
+    pl_assert(params->width > 0 && params->height >= 0 && params->depth >= 0);
+    pl_assert(params->comps > 0);
+    pl_assert(!params->cache || params->signature);
+
+    int sizes[] = { params->width, params->height, params->depth };
+    int size = params->width * PL_DEF(params->height, 1) * PL_DEF(params->depth, 1);
+    int dims = params->depth ? 3 : params->height ? 2 : 1;
+    enum sh_lut_method method = params->method;
+    if (method == SH_LUT_TETRAHEDRAL && dims != 3)
+        method = SH_LUT_LINEAR; // tetrahedral interpolation needs 3D data
+    if (method == SH_LUT_CUBIC && dims != 3)
+        method = SH_LUT_LINEAR; // so does (tri)cubic interpolation
+
+    int texdim = 0; // 0 = no texture dimensionality can hold this LUT
+    uint32_t max_tex_dim[] = {
+        gpu ? gpu->limits.max_tex_1d_dim : 0,
+        gpu ? gpu->limits.max_tex_2d_dim : 0,
+        (gpu && gpu->glsl.version > 100) ? gpu->limits.max_tex_3d_dim : 0,
+    };
+
+    struct sh_lut_obj *lut = SH_OBJ(sh, params->object, PL_SHADER_OBJ_LUT,
+                                    struct sh_lut_obj, sh_lut_uninit);
+
+    if (!lut)
+        return NULL_IDENT;
+    // Any change to the identifying parameters forces the LUT to be rebuilt
+    bool update = params->update || lut->signature != params->signature ||
+                  vartype != lut->vartype || params->fmt != lut->fmt ||
+                  params->width != lut->width || params->height != lut->height ||
+                  params->depth != lut->depth || params->comps != lut->comps;
+
+    if (lut->error && !update)
+        return NULL_IDENT; // suppress error spam until something changes
+
+    // Try picking the right number of dimensions for the texture LUT. This
+    // allows e.g. falling back to 2D textures if 1D textures are unsupported.
+    for (int d = dims; d <= PL_ARRAY_SIZE(max_tex_dim); d++) {
+        // For a given dimension to be compatible, all coordinates need to be
+        // within the maximum texture size for that dimension
+        for (int i = 0; i < d; i++) {
+            if (sizes[i] > max_tex_dim[d - 1])
+                goto next_dim;
+        }
+
+        // All dimensions are compatible, so pick this texture dimension
+        texdim = d;
+        break;
+
+next_dim: ; // `continue` out of the inner loop
+    }
+
+    static const enum pl_fmt_type fmt_type[PL_VAR_TYPE_COUNT] = {
+        [PL_VAR_SINT]   = PL_FMT_SINT,
+        [PL_VAR_UINT]   = PL_FMT_UINT,
+        [PL_VAR_FLOAT]  = PL_FMT_FLOAT,
+    };
+
+    enum pl_fmt_caps texcaps = PL_FMT_CAP_SAMPLEABLE;
+    bool is_linear = method == SH_LUT_LINEAR || method == SH_LUT_CUBIC;
+    if (is_linear)
+        texcaps |= PL_FMT_CAP_LINEAR;
+
+    pl_fmt texfmt = params->fmt;
+    if (texfmt) {
+        bool ok;
+        switch (texfmt->type) {
+        case PL_FMT_SINT: ok = vartype == PL_VAR_SINT; break;
+        case PL_FMT_UINT: ok = vartype == PL_VAR_UINT; break;
+        default:          ok = vartype == PL_VAR_FLOAT; break;
+        }
+
+        if (!ok) {
+            PL_ERR(sh, "Specified texture format '%s' does not match LUT "
+                   "data type!", texfmt->name);
+            goto error;
+        }
+
+        if (~texfmt->caps & texcaps) {
+            PL_ERR(sh, "Specified texture format '%s' does not match "
+                   "required capabilities 0x%x!", texfmt->name, texcaps);
+            goto error;
+        }
+    }
+    // No format requested: look one up matching the data type and capabilities
+    if (texdim && !texfmt) {
+        texfmt = pl_find_fmt(gpu, fmt_type[vartype], params->comps,
+                             vartype == PL_VAR_FLOAT ? 16 : 32,
+                             pl_var_type_size(vartype) * 8,
+                             texcaps);
+    }
+
+    enum sh_lut_type type = params->lut_type;
+
+    // The linear sampling code currently only supports 1D linear interpolation
+    if (is_linear && dims > 1) {
+        if (texfmt) {
+            type = SH_LUT_TEXTURE;
+        } else {
+            PL_ERR(sh, "Can't emulate linear LUTs for 2D/3D LUTs and no "
+                   "texture support available!");
+            goto error;
+        }
+    }
+
+    bool can_uniform = gpu && gpu->limits.max_variable_comps >= size * params->comps;
+    bool can_literal = sh_glsl(sh).version > 110; // needed for literal arrays
+    can_literal &= size <= SH_LUT_MAX_LITERAL_HARD && !params->dynamic;
+
+    // Deselect unsupported methods
+    if (type == SH_LUT_UNIFORM && !can_uniform)
+        type = SH_LUT_AUTO;
+    if (type == SH_LUT_LITERAL && !can_literal)
+        type = SH_LUT_AUTO;
+    if (type == SH_LUT_TEXTURE && !texfmt)
+        type = SH_LUT_AUTO;
+
+    // Sorted by priority
+    if (!type && can_literal && !method && size <= SH_LUT_MAX_LITERAL_SOFT)
+        type = SH_LUT_LITERAL;
+    if (!type && texfmt)
+        type = SH_LUT_TEXTURE;
+    if (!type && can_uniform)
+        type = SH_LUT_UNIFORM;
+    if (!type && can_literal)
+        type = SH_LUT_LITERAL;
+
+    if (!type) {
+        PL_ERR(sh, "Can't generate LUT: no compatible methods!");
+        goto error;
+    }
+
+    // Reinitialize the existing LUT if needed
+    update |= type != lut->type;
+    update |= method != lut->method;
+
+    if (update) {
+        if (params->dynamic)
+            pl_log_level_cap(sh->log, PL_LOG_TRACE);
+
+        size_t el_size = params->comps * pl_var_type_size(vartype);
+        if (type == SH_LUT_TEXTURE)
+            el_size = texfmt->texel_size;
+        // Reuse cached contents of the right size, otherwise regenerate them
+        size_t buf_size = size * el_size;
+        if (pl_cache_get(params->cache, &obj) && obj.size == buf_size) {
+            PL_DEBUG(sh, "Re-using cached LUT (0x%"PRIx64") with size %zu",
+                     obj.key, obj.size);
+        } else {
+            PL_DEBUG(sh, "LUT invalidated, regenerating..");
+            pl_cache_obj_resize(NULL, &obj, buf_size);
+            pl_clock_t start = pl_clock_now();
+            params->fill(obj.data, params);
+            pl_log_cpu_time(sh->log, start, pl_clock_now(), "generating shader LUT");
+        }
+
+        pl_assert(obj.data && obj.size);
+        if (params->dynamic)
+            pl_log_level_cap(sh->log, PL_LOG_NONE);
+        // Store the contents in the representation required by `type`
+        switch (type) {
+        case SH_LUT_TEXTURE: {
+            if (!texdim) {
+                PL_ERR(sh, "Texture LUT exceeds texture dimensions!");
+                goto error;
+            }
+
+            if (!texfmt) {
+                PL_ERR(sh, "Found no compatible texture format for LUT!");
+                goto error;
+            }
+
+            struct pl_tex_params tex_params = {
+                .w              = params->width,
+                .h              = PL_DEF(params->height, texdim >= 2 ? 1 : 0),
+                .d              = PL_DEF(params->depth,  texdim >= 3 ? 1 : 0),
+                .format         = texfmt,
+                .sampleable     = true,
+                .host_writable  = params->dynamic,
+                .initial_data   = params->dynamic ? NULL : obj.data,
+                .debug_tag      = params->debug_tag,
+            };
+
+            bool ok;
+            if (params->dynamic) {
+                ok = pl_tex_recreate(gpu, &lut->tex, &tex_params);
+                if (ok) {
+                    ok = pl_tex_upload(gpu, pl_tex_transfer_params(
+                        .tex = lut->tex,
+                        .ptr = obj.data,
+                    ));
+                }
+            } else {
+                // Can't use pl_tex_recreate because of `initial_data`
+                pl_tex_destroy(gpu, &lut->tex);
+                lut->tex = pl_tex_create(gpu, &tex_params);
+                ok = lut->tex;
+            }
+
+            if (!ok) {
+                PL_ERR(sh, "Failed creating LUT texture!");
+                goto error;
+            }
+            break;
+        }
+
+        case SH_LUT_UNIFORM:
+            pl_free(lut->data);
+            lut->data = pl_memdup(NULL, obj.data, obj.size);
+            break;
+
+        case SH_LUT_LITERAL: {
+            lut->str.len = 0; // rebuild the initializer text from scratch
+            static const char prefix[PL_VAR_TYPE_COUNT] = {
+                [PL_VAR_SINT]   = 'i',
+                [PL_VAR_UINT]   = 'u',
+                [PL_VAR_FLOAT]  = ' ',
+            };
+
+            for (int i = 0; i < size * params->comps; i += params->comps) {
+                if (i > 0)
+                    pl_str_append_asprintf_c(lut, &lut->str, ",");
+                if (params->comps > 1) {
+                    pl_str_append_asprintf_c(lut, &lut->str, "%cvec%d(",
+                                             prefix[vartype], params->comps);
+                }
+                for (int c = 0; c < params->comps; c++) {
+                    switch (vartype) {
+                    case PL_VAR_FLOAT:
+                        pl_str_append_asprintf_c(lut, &lut->str, "%s%f",
+                                                 c > 0 ? "," : "",
+                                                 ((float *) obj.data)[i+c]);
+                        break;
+                    case PL_VAR_UINT:
+                        pl_str_append_asprintf_c(lut, &lut->str, "%s%u",
+                                                 c > 0 ? "," : "",
+                                                 ((unsigned int *) obj.data)[i+c]);
+                        break;
+                    case PL_VAR_SINT:
+                        pl_str_append_asprintf_c(lut, &lut->str, "%s%d",
+                                                 c > 0 ? "," : "",
+                                                 ((int *) obj.data)[i+c]);
+                        break;
+                    case PL_VAR_INVALID:
+                    case PL_VAR_TYPE_COUNT:
+                        pl_unreachable();
+                    }
+                }
+                if (params->comps > 1)
+                    pl_str_append_asprintf_c(lut, &lut->str, ")");
+            }
+            break;
+        }
+
+        case SH_LUT_AUTO:
+            pl_unreachable();
+        }
+        // Remember what was generated, so that unchanged calls can skip it
+        lut->type = type;
+        lut->method = method;
+        lut->vartype = vartype;
+        lut->fmt = params->fmt;
+        lut->width = params->width;
+        lut->height = params->height;
+        lut->depth = params->depth;
+        lut->comps = params->comps;
+        lut->signature = params->signature;
+        pl_cache_set(params->cache, &obj);
+    }
+
+    // Done updating, generate the GLSL
+    ident_t name = sh_fresh(sh, "lut");
+    ident_t arr_name = NULL_IDENT;
+
+    static const char * const swizzles[] = {"x", "xy", "xyz", "xyzw"};
+    static const char * const vartypes[PL_VAR_TYPE_COUNT][4] = {
+        [PL_VAR_SINT] = { "int", "ivec2", "ivec3", "ivec4" },
+        [PL_VAR_UINT] = { "uint", "uvec2", "uvec3", "uvec4" },
+        [PL_VAR_FLOAT] = { "float", "vec2", "vec3", "vec4" },
+    };
+
+    switch (type) {
+    case SH_LUT_TEXTURE: {
+        pl_assert(texdim);
+        ident_t tex = sh_desc(sh, (struct pl_shader_desc) {
+            .desc = {
+                .name = "weights",
+                .type = PL_DESC_SAMPLED_TEX,
+            },
+            .binding = {
+                .object = lut->tex,
+                .sample_mode = is_linear ? PL_TEX_SAMPLE_LINEAR
+                                         : PL_TEX_SAMPLE_NEAREST,
+            }
+        });
+
+        if (is_linear) {
+            ident_t pos_macros[PL_ARRAY_SIZE(sizes)] = {0};
+            for (int i = 0; i < dims; i++)
+                pos_macros[i] = texel_scale(sh, sizes[i], true);
+
+            GLSLH("#define "$"(pos) (textureLod("$", %s(\\\n",
+                  name, tex, vartypes[PL_VAR_FLOAT][texdim - 1]);
+
+            for (int i = 0; i < texdim; i++) {
+                char sep = i == 0 ? ' ' : ',';
+                if (pos_macros[i]) {
+                    if (dims > 1) {
+                        GLSLH("   %c"$"(%s(pos).%c)\\\n", sep, pos_macros[i],
+                              vartypes[PL_VAR_FLOAT][dims - 1], "xyzw"[i]);
+                    } else {
+                        GLSLH("   %c"$"(float(pos))\\\n", sep, pos_macros[i]);
+                    }
+                } else { // texture axis beyond the LUT's own dimensions
+                    GLSLH("   %c%f\\\n", sep, 0.5);
+                }
+            }
+            GLSLH("  ), 0.0).%s)\n", swizzles[params->comps - 1]);
+        } else {
+            GLSLH("#define "$"(pos) (texelFetch("$", %s(pos",
+                  name, tex, vartypes[PL_VAR_SINT][texdim - 1]);
+
+            // Fill up extra components of the index
+            for (int i = dims; i < texdim; i++)
+                GLSLH(", 0");
+
+            GLSLH("), 0).%s)\n", swizzles[params->comps - 1]);
+        }
+        break;
+    }
+
+    case SH_LUT_UNIFORM:
+        arr_name = sh_var(sh, (struct pl_shader_var) {
+            .var = {
+                .name = "weights",
+                .type = vartype,
+                .dim_v = params->comps,
+                .dim_m = 1,
+                .dim_a = size,
+            },
+            .data = lut->data,
+        });
+        break;
+
+    case SH_LUT_LITERAL:
+        arr_name = sh_fresh(sh, "weights");
+        GLSLH("const %s "$"[%d] = %s[](\n  ",
+              vartypes[vartype][params->comps - 1], arr_name, size,
+              vartypes[vartype][params->comps - 1]);
+        sh_append_str(sh, SH_BUF_HEADER, lut->str);
+        GLSLH(");\n");
+        break;
+
+    case SH_LUT_AUTO:
+        pl_unreachable();
+    }
+
+    if (arr_name) { // array-backed LUT: index it by the flattened position
+        GLSLH("#define "$"(pos) ("$"[int((pos)%s)\\\n",
+              name, arr_name, dims > 1 ? "[0]" : "");
+        int shift = params->width;
+        for (int i = 1; i < dims; i++) {
+            GLSLH("    + %d * int((pos)[%d])\\\n", shift, i);
+            shift *= sizes[i];
+        }
+        GLSLH("  ])\n");
+
+        if (is_linear) { // emulate 1D linear filtering on top of the array
+            pl_assert(dims == 1);
+            pl_assert(vartype == PL_VAR_FLOAT);
+            ident_t arr_lut = name;
+            name = sh_fresh(sh, "lut_lin");
+            GLSLH("%s "$"(float fpos) {                             \n"
+                  "    fpos = clamp(fpos, 0.0, 1.0) * %d.0;         \n"
+                  "    float fbase = floor(fpos);                   \n"
+                  "    float fceil = ceil(fpos);                    \n"
+                  "    float fcoord = fpos - fbase;                 \n"
+                  "    return mix("$"(fbase), "$"(fceil), fcoord);  \n"
+                  "}                                                \n",
+                  vartypes[PL_VAR_FLOAT][params->comps - 1], name,
+                  size - 1,
+                  arr_lut, arr_lut);
+        }
+    }
+
+    if (method == SH_LUT_CUBIC && dims == 3) {
+        ident_t lin_lut = name;
+        name = sh_fresh(sh, "lut_tricubic");
+        GLSLH("%s "$"(vec3 pos) {                                       \n"
+              "    vec3 scale = vec3(%d.0, %d.0, %d.0);                 \n"
+              "    vec3 scale_inv = 1.0 / scale;                        \n"
+              "    pos *= scale;                                        \n"
+              "    vec3 fpos = fract(pos);                              \n"
+              "    vec3 base = pos - fpos;                              \n"
+              "    vec3 fpos2 = fpos * fpos;                            \n"
+              "    vec3 inv = 1.0 - fpos;                               \n"
+              "    vec3 inv2 = inv * inv;                               \n"
+              "    vec3 w0 = 1.0/6.0 * inv2 * inv;                      \n"
+              "    vec3 w1 = 2.0/3.0 - 0.5 * fpos2 * (2.0 - fpos);      \n"
+              "    vec3 w2 = 2.0/3.0 - 0.5 * inv2 * (2.0 - inv);        \n"
+              "    vec3 w3 = 1.0/6.0 * fpos2 * fpos;                    \n"
+              "    vec3 g0 = w0 + w1;                                   \n"
+              "    vec3 g1 = w2 + w3;                                   \n"
+              "    vec3 h0 = scale_inv * ((w1 / g0) - 1.0 + base);      \n"
+              "    vec3 h1 = scale_inv * ((w3 / g1) + 1.0 + base);      \n"
+              "    %s c000, c001, c010, c011, c100, c101, c110, c111;   \n"
+              "    c000 = "$"(h0);                                      \n"
+              "    c100 = "$"(vec3(h1.x, h0.y, h0.z));                  \n"
+              "    c000 = mix(c100, c000, g0.x);                        \n"
+              "    c010 = "$"(vec3(h0.x, h1.y, h0.z));                  \n"
+              "    c110 = "$"(vec3(h1.x, h1.y, h0.z));                  \n"
+              "    c010 = mix(c110, c010, g0.x);                        \n"
+              "    c000 = mix(c010, c000, g0.y);                        \n"
+              "    c001 = "$"(vec3(h0.x, h0.y, h1.z));                  \n"
+              "    c101 = "$"(vec3(h1.x, h0.y, h1.z));                  \n"
+              "    c001 = mix(c101, c001, g0.x);                        \n"
+              "    c011 = "$"(vec3(h0.x, h1.y, h1.z));                  \n"
+              "    c111 = "$"(h1);                                      \n"
+              "    c011 = mix(c111, c011, g0.x);                        \n"
+              "    c001 = mix(c011, c001, g0.y);                        \n"
+              "    return mix(c001, c000, g0.z);                        \n"
+              "}                                                        \n",
+              vartypes[PL_VAR_FLOAT][params->comps - 1], name,
+              sizes[0] - 1, sizes[1] - 1, sizes[2] - 1,
+              vartypes[PL_VAR_FLOAT][params->comps - 1],
+              lin_lut, lin_lut, lin_lut, lin_lut,
+              lin_lut, lin_lut, lin_lut, lin_lut);
+    }
+
+    if (method == SH_LUT_TETRAHEDRAL) {
+        ident_t int_lut = name;
+        name = sh_fresh(sh, "lut_barycentric");
+        GLSLH("%s "$"(vec3 pos) {                                       \n"
+              // Compute bounding vertices and fractional part
+              "    pos = clamp(pos, 0.0, 1.0) * vec3(%d.0, %d.0, %d.0); \n"
+              "    vec3 base = floor(pos);                              \n"
+              "    vec3 fpart = pos - base;                             \n"
+              // v0 and v3 are always 'black' and 'white', respectively
+              // v1 and v2 are the closest RGB and CMY vertices, respectively
+              "    ivec3 v0 = ivec3(base), v3 = ivec3(ceil(pos));       \n"
+              "    ivec3 v1 = v0, v2 = v3;                              \n"
+              // Table of boolean checks to simplify following math
+              "    bvec3 c = greaterThanEqual(fpart.xyz, fpart.yzx);    \n"
+              "    bool c_xy = c.x, c_yx = !c.x,                        \n"
+              "       c_yz = c.y, c_zy = !c.y,                          \n"
+              "       c_zx = c.z, c_xz = !c.z;                          \n"
+              "    vec3 s = fpart.xyz;                                  \n"
+              "    bool cond;                                           \n",
+              vartypes[PL_VAR_FLOAT][params->comps - 1], name,
+              sizes[0] - 1, sizes[1] - 1, sizes[2] - 1);
+
+        // Subdivision of the cube into six congruent tetrahedras
+        //
+        // For each tetrahedron, test if the point is inside, and if so, update
+        // the edge vertices. We test all six, even though only one case will
+        // ever be true, because this avoids branches.
+        static const char *indices[] = { "xyz", "xzy", "zxy", "zyx", "yzx", "yxz"};
+        for (int i = 0; i < PL_ARRAY_SIZE(indices); i++) {
+            const char x = indices[i][0], y = indices[i][1], z = indices[i][2];
+            GLSLH("cond = c_%c%c && c_%c%c;          \n"
+                  "s = cond ? fpart.%c%c%c : s;      \n"
+                  "v1.%c = cond ? v3.%c : v1.%c;     \n"
+                  "v2.%c = cond ? v0.%c : v2.%c;     \n",
+                  x, y, y, z,
+                  x, y, z,
+                  x, x, x,
+                  z, z, z);
+        }
+
+        // Interpolate in barycentric coordinates, with four texel fetches
+        GLSLH("    return (1.0 - s.x) * "$"(v0) +   \n"
+              "           (s.x - s.y) * "$"(v1) +   \n"
+              "           (s.y - s.z) * "$"(v2) +   \n"
+              "           (s.z)       * "$"(v3);    \n"
+              "}                                    \n",
+              int_lut, int_lut, int_lut, int_lut);
+    }
+
+    lut->error = false;
+    pl_cache_obj_free(&obj);
+    pl_assert(name);
+    return name;
+
+error:
+    lut->error = true; // remembered so that identical calls fail silently
+    pl_cache_obj_free(&obj);
+    return NULL_IDENT;
+}
diff --git a/src/shaders/meson.build b/src/shaders/meson.build
new file mode 100644
index 0000000..746747c
--- /dev/null
+++ b/src/shaders/meson.build
@@ -0,0 +1,23 @@
+shader_sources = [  # C sources that are run through `glsl_preproc` below
+  'colorspace.c',
+  'custom.c',
+  'custom_mpv.c',
+  'deinterlacing.c',
+  'dithering.c',
+  'film_grain.c',
+  'film_grain_av1.c',
+  'film_grain_h274.c',
+  'icc.c',
+  'lut.c',
+  'sampling.c',
+]
+
+foreach s : shader_sources
+  sources += custom_target(s,  # the generated file is what gets compiled
+    command: glsl_preproc,
+    depend_files: glsl_deps,
+    env: python_env,
+    input: s,
+    output: s,  # generated file keeps the name of its input
+  )
+endforeach
diff --git a/src/shaders/sampling.c b/src/shaders/sampling.c
new file mode 100644
index 0000000..fc10f80
--- /dev/null
+++ b/src/shaders/sampling.c
@@ -0,0 +1,1198 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <math.h>
+#include "shaders.h"
+
+#include <libplacebo/colorspace.h>
+#include <libplacebo/shaders/sampling.h>
+
+const struct pl_deband_params pl_deband_default_params = { PL_DEBAND_DEFAULTS }; // substituted by pl_shader_deband() when it is given no params
+
+static inline struct pl_tex_params src_params(const struct pl_sample_src *src)
+{
+    // No texture bound: only the dimensions advertised by the caller are known
+    if (!src->tex) {
+        return (struct pl_tex_params) { .w = src->tex_w, .h = src->tex_h };
+    }
+
+    // Otherwise the texture's own parameters are used verbatim
+    return src->tex->params;
+}
+
+enum filter { // the first two values are cast to enum pl_tex_sample_mode in setup_src
+    NEAREST = PL_TEX_SAMPLE_NEAREST, // require nearest sampling
+    LINEAR  = PL_TEX_SAMPLE_LINEAR, // require linear sampling; setup_src fails if unsupported
+    BEST, // texture: linear if the format allows it, else nearest; external sampler: its own mode
+    FASTEST, // texture: nearest; external sampler: its own mode
+};
+
+// Validates `src`, picks the sample mode demanded by `filter` and fills the
+// outputs (`pt`, the ratios, `comp_mask`, `scale` are optional); false on error.
+static bool setup_src(pl_shader sh, const struct pl_sample_src *src,
+                      ident_t *src_tex, ident_t *pos, ident_t *pt,
+                      float *ratio_x, float *ratio_y, uint8_t *comp_mask,
+                      float *scale, bool resizeable, enum filter filter)
+{
+    enum pl_shader_sig sig;
+    float src_w, src_h;
+    enum pl_tex_sample_mode sample_mode;
+    if (src->tex) { // sampling from a texture that is bound below via sh_bind()
+        pl_fmt fmt = src->tex->params.format;
+        bool can_linear = fmt->caps & PL_FMT_CAP_LINEAR;
+        pl_assert(pl_tex_params_dimension(src->tex->params) == 2);
+        sig = PL_SHADER_SIG_NONE;
+        src_w = pl_rect_w(src->rect);
+        src_h = pl_rect_h(src->rect);
+        switch (filter) {
+        case FASTEST:
+        case NEAREST:
+            sample_mode = PL_TEX_SAMPLE_NEAREST;
+            break;
+        case LINEAR:
+            if (!can_linear) {
+                SH_FAIL(sh, "Trying to use a shader that requires linear "
+                        "sampling with a texture whose format (%s) does not "
+                        "support PL_FMT_CAP_LINEAR", fmt->name);
+                return false;
+            }
+            sample_mode = PL_TEX_SAMPLE_LINEAR;
+            break;
+        case BEST:
+            sample_mode = can_linear ? PL_TEX_SAMPLE_LINEAR : PL_TEX_SAMPLE_NEAREST;
+            break;
+        }
+    } else { // sampling through an external sampler: its mode cannot be changed
+        pl_assert(src->tex_w && src->tex_h);
+        sig = PL_SHADER_SIG_SAMPLER;
+        src_w = src->sampled_w;
+        src_h = src->sampled_h;
+        if (filter == BEST || filter == FASTEST) {
+            sample_mode = src->mode;
+        } else {
+            sample_mode = (enum pl_tex_sample_mode) filter;
+            if (sample_mode != src->mode) {
+                SH_FAIL(sh, "Trying to use a shader that requires a different "
+                        "filter mode than the external sampler.");
+                return false;
+            }
+        }
+    }
+
+    src_w = PL_DEF(src_w, src_params(src).w); // unset => full width
+    src_h = PL_DEF(src_h, src_params(src).h); // unset => full height
+    pl_assert(src_w && src_h);
+
+    int out_w = PL_DEF(src->new_w, roundf(fabs(src_w))); // default: same size as the source
+    int out_h = PL_DEF(src->new_h, roundf(fabs(src_h)));
+    pl_assert(out_w && out_h);
+
+    if (ratio_x)
+        *ratio_x = out_w / fabs(src_w); // output size / source size
+    if (ratio_y)
+        *ratio_y = out_h / fabs(src_h);
+    if (scale)
+        *scale = PL_DEF(src->scale, 1.0);
+
+    if (comp_mask) {
+        uint8_t tex_mask = 0x0Fu;
+        if (src->tex) {
+            // Mask containing only the number of components in the texture
+            tex_mask = (1 << src->tex->params.format->num_components) - 1;
+        }
+
+        uint8_t src_mask = src->component_mask;
+        if (!src_mask)
+            src_mask = (1 << PL_DEF(src->components, 4)) - 1;
+
+        // Only actually sample components that are both requested and
+        // available in the texture being sampled
+        *comp_mask = tex_mask & src_mask;
+    }
+
+    if (resizeable)
+        out_w = out_h = 0; // sh_require() is then given 0x0 instead of a fixed size
+    if (!sh_require(sh, sig, out_w, out_h))
+        return false;
+
+    if (src->tex) {
+        pl_rect2df rect = {
+            .x0 = src->rect.x0,
+            .y0 = src->rect.y0,
+            .x1 = src->rect.x0 + src_w, // src_w/src_h keep their sign here
+            .y1 = src->rect.y0 + src_h,
+        };
+
+        *src_tex = sh_bind(sh, src->tex, src->address_mode, sample_mode,
+                           "src_tex", &rect, pos, pt);
+    } else {
+        if (pt) {
+            float sx = 1.0 / src->tex_w, sy = 1.0 / src->tex_h;
+            if (src->sampler == PL_SAMPLER_RECT)
+                sx = sy = 1.0; // NOTE(review): presumably unnormalized coordinates
+
+            *pt = sh_var(sh, (struct pl_shader_var) {
+                .var = pl_var_vec2("tex_pt"),
+                .data = &(float[2]) { sx, sy },
+            });
+        }
+
+        sh->sampler_type = src->sampler;
+
+        pl_assert(src->format);
+        switch (src->format) {
+        case PL_FMT_UNKNOWN:
+        case PL_FMT_FLOAT:
+        case PL_FMT_UNORM:
+        case PL_FMT_SNORM: sh->sampler_prefix = ' '; break;
+        case PL_FMT_UINT: sh->sampler_prefix = 'u'; break;
+        case PL_FMT_SINT: sh->sampler_prefix = 'i'; break; // GLSL spells these isampler*
+        case PL_FMT_TYPE_COUNT:
+            pl_unreachable();
+        }
+
+        *src_tex = sh_fresh(sh, "src_tex");
+        *pos     = sh_fresh(sh, "pos");
+        // Alias the freshly generated identifiers to the names `src_tex` and `pos`
+        GLSLH("#define "$" src_tex  \n"
+              "#define "$" pos      \n",
+              *src_tex, *pos);
+    }
+
+    return true;
+}
+
+void pl_shader_deband(pl_shader sh, const struct pl_sample_src *src,
+                      const struct pl_deband_params *params)
+{
+    float scale;
+    ident_t tex, pos, pt;
+    uint8_t mask;
+    if (!setup_src(sh, src, &tex, &pos, &pt, NULL, NULL, &mask, &scale, false, LINEAR))
+        return;
+    // Emit a debanding pass into `sh`; a NULL `params` selects the defaults
+    params = PL_DEF(params, &pl_deband_default_params);
+    sh_describe(sh, "debanding");
+    GLSL("vec4 color;                       \n"
+         "// pl_shader_deband               \n"
+         "{                                 \n"
+         "vec2 pos = "$", pt = "$";         \n"
+         "color = textureLod("$", pos, 0.0);\n",
+         pos, pt, tex);
+
+    mask &= ~0x8u; // ignore alpha channel
+    uint8_t num_comps = sh_num_comps(mask);
+    const char *swiz = sh_swizzle(mask);
+    pl_assert(num_comps <= 3);
+    if (!num_comps) { // nothing left to deband: apply the scale and close the block
+        GLSL("color *= "$"; \n"
+             "}             \n",
+             SH_FLOAT(scale));
+        return;
+    }
+    // GET(X, Y) fetches the selected components at pos + pt * (X, Y); T is their type
+    GLSL("#define GET(X, Y)                                   \\\n"
+         "    (textureLod("$", pos + pt * vec2(X, Y), 0.0).%s)  \n"
+         "#define T %s                                          \n",
+         tex, swiz, sh_float_type(mask));
+
+    ident_t prng = sh_prng(sh, true, NULL); // feeds both the sample offsets and the grain
+    GLSL("T avg, diff, bound;   \n"
+         "T res = color.%s;     \n"
+         "vec2 d;               \n",
+         swiz);
+
+    if (params->iterations > 0) {
+        ident_t radius = sh_const_float(sh, "radius", params->radius);
+        ident_t threshold = sh_const_float(sh, "threshold",
+                                           params->threshold / (1000 * scale)); // `res` is still unscaled
+
+        // For each iteration, compute the average at a given distance and
+        // pick it instead of the color if the difference is below the threshold.
+        for (int i = 1; i <= params->iterations; i++) {
+            GLSL(// Compute a random angle and distance
+                 "d = "$".xy * vec2(%d.0 * "$", %f);    \n"
+                 "d = d.x * vec2(cos(d.y), sin(d.y));   \n"
+                 // Sample at quarter-turn intervals around the source pixel
+                 "avg = T(0.0);                         \n"
+                 "avg += GET(+d.x, +d.y);               \n"
+                 "avg += GET(-d.x, +d.y);               \n"
+                 "avg += GET(-d.x, -d.y);               \n"
+                 "avg += GET(+d.x, -d.y);               \n"
+                 "avg *= 0.25;                          \n"
+                 // Compare the (normalized) average against the pixel
+                 "diff = abs(res - avg);                \n"
+                 "bound = T("$" / %d.0);                \n",
+                 prng, i, radius, M_PI * 2,
+                 threshold, i);
+
+            if (num_comps > 1) { // vector types need the component-wise comparison
+                GLSL("res = mix(avg, res, greaterThan(diff, bound)); \n");
+            } else {
+                GLSL("res = mix(avg, res, diff > bound); \n");
+            }
+        }
+    }
+
+    // Add some random noise to smooth out residual differences
+    if (params->grain > 0) {
+        // Avoid adding grain near true black
+        GLSL("bound = T(\n");
+        for (int c = 0; c < num_comps; c++) {
+            GLSL("%c"$, c > 0 ? ',' : ' ',
+                 SH_FLOAT(params->grain_neutral[c] / scale));
+        }
+        GLSL(");                                        \n"
+             "T strength = min(abs(res - bound), "$");  \n"
+             "res += strength * (T("$") - T(0.5));      \n",
+             SH_FLOAT(params->grain / (1000.0 * scale)), prng);
+    }
+
+    GLSL("color.%s = res;   \n"
+         "color *= "$";     \n"
+         "#undef T          \n"
+         "#undef GET        \n"
+         "}                 \n",
+         swiz, SH_FLOAT(scale));
+}
+
+bool pl_shader_sample_direct(pl_shader sh, const struct pl_sample_src *src)
+{
+    ident_t src_tex, coord;
+    float mul;
+    // BEST: let setup_src pick whichever sample mode the source can offer
+    if (!setup_src(sh, src, &src_tex, &coord, NULL, NULL, NULL, NULL, &mul, true, BEST))
+        return false;
+    GLSL("// pl_shader_sample_direct                            \n"
+         "vec4 color = vec4("$") * textureLod("$", "$", 0.0);   \n",
+         SH_FLOAT(mul), src_tex, coord);
+    return true;
+}
+
+bool pl_shader_sample_nearest(pl_shader sh, const struct pl_sample_src *src)
+{
+    ident_t src_tex, coord;
+    float mul;
+    // NEAREST is mandatory here; an external sampler in another mode is rejected
+    if (!setup_src(sh, src, &src_tex, &coord, NULL, NULL, NULL, NULL, &mul, true, NEAREST))
+        return false;
+    sh_describe(sh, "nearest");
+    GLSL("// pl_shader_sample_nearest                           \n"
+         "vec4 color = vec4("$") * textureLod("$", "$", 0.0);   \n",
+         SH_FLOAT(mul), src_tex, coord);
+    return true;
+}
+
+bool pl_shader_sample_bilinear(pl_shader sh, const struct pl_sample_src *src)
+{
+    ident_t src_tex, coord;
+    float mul;
+    // LINEAR is mandatory here; setup_src rejects sources that cannot provide it
+    if (!setup_src(sh, src, &src_tex, &coord, NULL, NULL, NULL, NULL, &mul, true, LINEAR))
+        return false;
+    sh_describe(sh, "bilinear");
+    GLSL("// pl_shader_sample_bilinear                          \n"
+         "vec4 color = vec4("$") * textureLod("$", "$", 0.0);   \n",
+         SH_FLOAT(mul), src_tex, coord);
+    return true;
+}
+
+bool pl_shader_sample_bicubic(pl_shader sh, const struct pl_sample_src *src)
+{
+    ident_t tex, pos, pt;
+    float rx, ry, scale;
+    if (!setup_src(sh, src, &tex, &pos, &pt, &rx, &ry, NULL, &scale, true, LINEAR))
+        return false;
+    // rx/ry are output size / source size (see setup_src); < 1 means downscaling
+    if (rx < 1 || ry < 1) {
+        PL_TRACE(sh, "Using fast bicubic sampling when downscaling. This "
+                 "will most likely result in nasty aliasing!");
+    }
+
+    // Explanation of how bicubic scaling with only 4 texel fetches is done:
+    //   http://www.mate.tue.nl/mate/pdfs/10318.pdf
+    //   'Efficient GPU-Based Texture Interpolation using Uniform B-Splines'
+
+    sh_describe(sh, "bicubic");
+#pragma GLSL /* pl_shader_sample_bicubic */         \
+    vec4 color;                                     \
+    {                                               \
+    vec2 pos = $pos;                                \
+    vec2 size = vec2(textureSize($tex, 0));         \
+    vec2 frac  = fract(pos * size + vec2(0.5));     \
+    vec2 frac2 = frac * frac;                       \
+    vec2 inv   = vec2(1.0) - frac;                  \
+    vec2 inv2  = inv * inv;                         \
+    /* compute filter weights directly */           \
+    vec2 w0 = 1.0/6.0 * inv2 * inv;                 \
+    vec2 w1 = 2.0/3.0 - 0.5 * frac2 * (2.0 - frac); \
+    vec2 w2 = 2.0/3.0 - 0.5 * inv2  * (2.0 - inv);  \
+    vec2 w3 = 1.0/6.0 * frac2 * frac;               \
+    vec4 g = vec4(w0 + w1, w2 + w3);                \
+    vec4 h = vec4(w1, w3) / g + inv.xyxy;           \
+    h.xy -= vec2(2.0);                              \
+    /* sample four corners, then interpolate */     \
+    vec4 p = pos.xyxy + $pt.xyxy * h;               \
+    vec4 c00 = textureLod($tex, p.xy, 0.0);         \
+    vec4 c01 = textureLod($tex, p.xw, 0.0);         \
+    vec4 c0 = mix(c01, c00, g.y);                   \
+    vec4 c10 = textureLod($tex, p.zy, 0.0);         \
+    vec4 c11 = textureLod($tex, p.zw, 0.0);         \
+    vec4 c1 = mix(c11, c10, g.y);                   \
+    color = ${float:scale} * mix(c1, c0, g.x);      \
+    }
+
+    return true;
+}
+
+bool pl_shader_sample_hermite(pl_shader sh, const struct pl_sample_src *src)
+{
+    ident_t tex, pos, pt;
+    float rx, ry, scale;
+    if (!setup_src(sh, src, &tex, &pos, &pt, &rx, &ry, NULL, &scale, true, LINEAR))
+        return false;
+    // rx/ry are output size / source size (see setup_src); < 1 means downscaling
+    if (rx < 1 || ry < 1) {
+        PL_TRACE(sh, "Using fast hermite sampling when downscaling. This "
+                 "will most likely result in nasty aliasing!");
+    }
+    // One linear fetch at a position shifted by (smoothstep(frac) - frac) * pt
+    sh_describe(sh, "hermite");
+#pragma GLSL /* pl_shader_sample_hermite */              \
+    vec4 color;                                          \
+    {                                                    \
+    vec2 pos  = $pos;                                    \
+    vec2 size = vec2(textureSize($tex, 0));              \
+    vec2 frac = fract(pos * size + vec2(0.5));           \
+    pos += $pt * (smoothstep(0.0, 1.0, frac) - frac);    \
+    color = ${float:scale} * textureLod($tex, pos, 0.0); \
+    }
+
+    return true;
+}
+
+bool pl_shader_sample_gaussian(pl_shader sh, const struct pl_sample_src *src)
+{
+    ident_t tex, pos, pt;
+    float rx, ry, scale;
+    if (!setup_src(sh, src, &tex, &pos, &pt, &rx, &ry, NULL, &scale, true, LINEAR))
+        return false;
+    // rx/ry are output size / source size (see setup_src); < 1 means downscaling
+    if (rx < 1 || ry < 1) {
+        PL_TRACE(sh, "Using fast gaussian sampling when downscaling. This "
+                 "will most likely result in nasty aliasing!");
+    }
+    // Four fetches are blended using per-axis weights w0..w3 computed with exp()
+    sh_describe(sh, "gaussian");
+#pragma GLSL /* pl_shader_sample_gaussian */        \
+    vec4 color;                                     \
+    {                                               \
+    vec2 pos  = $pos;                               \
+    vec2 size = vec2(textureSize($tex, 0));         \
+    vec2 off  = -fract(pos * size + vec2(0.5));     \
+    vec2 off2 = -2.0 * off * off;                   \
+    /* compute gaussian weights */                  \
+    vec2 w0 = exp(off2 + 4.0 * off - vec2(2.0));    \
+    vec2 w1 = exp(off2);                            \
+    vec2 w2 = exp(off2 - 4.0 * off - vec2(2.0));    \
+    vec2 w3 = exp(off2 - 8.0 * off - vec2(8.0));    \
+    vec4 g = vec4(w0 + w1, w2 + w3);                \
+    vec4 h = vec4(w1, w3) / g;                      \
+    h.xy -= vec2(1.0);                              \
+    h.zw += vec2(1.0);                              \
+    g.xy /= g.xy + g.zw; /* explicitly normalize */ \
+    /* sample four corners, then interpolate */     \
+    vec4 p = pos.xyxy + $pt.xyxy * (h + off.xyxy);  \
+    vec4 c00 = textureLod($tex, p.xy, 0.0);         \
+    vec4 c01 = textureLod($tex, p.xw, 0.0);         \
+    vec4 c0 = mix(c01, c00, g.y);                   \
+    vec4 c10 = textureLod($tex, p.zy, 0.0);         \
+    vec4 c11 = textureLod($tex, p.zw, 0.0);         \
+    vec4 c1 = mix(c11, c10, g.y);                   \
+    color = ${float:scale} * mix(c1, c0, g.x);      \
+    }
+
+    return true;
+}
+
+bool pl_shader_sample_oversample(pl_shader sh, const struct pl_sample_src *src,
+                                 float threshold)
+{
+    ident_t tex, pos, pt;
+    float rx, ry, scale;
+    if (!setup_src(sh, src, &tex, &pos, &pt, &rx, &ry, NULL, &scale, true, LINEAR))
+        return false;
+    // `threshold` is clamped to [0, 0.5]; the snapping step below is guarded by `> 0`
+    threshold = PL_CLAMP(threshold, 0.0f, 0.5f);
+    sh_describe(sh, "oversample");
+    #pragma GLSL /* pl_shader_sample_oversample */       \
+    vec4 color;                                          \
+    {                                                    \
+    vec2 pos = $pos;                                     \
+    vec2 size = vec2(textureSize($tex, 0));              \
+    /* Round the position to the nearest pixel */        \
+    vec2 fcoord = fract(pos * size - vec2(0.5));         \
+    float rx = ${dynamic float:rx};                      \
+    float ry = ${dynamic float:ry};                      \
+    vec2 coeff = (fcoord - vec2(0.5)) * vec2(rx, ry);    \
+    coeff = clamp(coeff + vec2(0.5), 0.0, 1.0);          \
+    @if (threshold > 0) {                                \
+        float thresh = ${float:threshold};               \
+        coeff = mix(coeff, vec2(0.0),                    \
+            lessThan(coeff, vec2(thresh)));              \
+        coeff = mix(coeff, vec2(1.0),                    \
+            greaterThan(coeff, vec2(1.0 - thresh)));     \
+    @}                                                   \
+                                                         \
+    /* Compute the right output blend of colors */       \
+    pos += (coeff - fcoord) * $pt;                       \
+    color = ${float:scale} * textureLod($tex, pos, 0.0); \
+    }
+
+    return true;
+}
+
+static void describe_filter(pl_shader sh, const struct pl_filter_config *cfg,
+                            const char *stage, float rx, float ry)
+{
+    const char *dir = "ana"; // fallback when none of the cases below match
+    if (rx > 1 && ry > 1) {
+        dir = "up";
+    } else if (rx < 1 && ry < 1) {
+        dir = "down";
+    } else if (rx == 1 && ry == 1) {
+        dir = "noop";
+    }
+
+    // A named config is reported as-is; otherwise name the kernel (and window)
+    if (cfg->name) {
+        sh_describef(sh, "%s %sscaling (%s)", stage, dir, cfg->name);
+        return;
+    }
+    const char *kernel = PL_DEF(cfg->kernel->name, "unknown");
+    if (cfg->window) {
+        sh_describef(sh, "%s %sscaling (%s+%s)", stage, dir, kernel,
+                     PL_DEF(cfg->window->name, "unknown"));
+    } else {
+        sh_describef(sh, "%s %sscaling (%s)", stage, dir, kernel);
+    }
+}
+
+// Subroutine for computing and adding an individual texel contribution at
+// offset (x, y). If `in` is NULL_IDENT, the texel is sampled directly from
+// `tex`; otherwise it is read from the arrays `in`_X[idx], where X is the
+// component index and `idx` must be defined by the caller's GLSL.
+static void polar_sample(pl_shader sh, pl_filter filter,
+                         ident_t tex, ident_t lut, ident_t radius,
+                         int x, int y, uint8_t comp_mask, ident_t in,
+                         bool use_ar, ident_t scale)
+{
+    // Since we can't know the subpixel position in advance, assume the worst
+    // case: positive offsets may be up to one texel closer than (x, y)
+    int yy = y > 0 ? y-1 : y;
+    int xx = x > 0 ? x-1 : x;
+    float dmin = sqrt(xx*xx + yy*yy); // smallest distance this texel can have
+    // Skip samples definitely outside the radius
+    if (dmin >= filter->radius)
+        return;
+
+    // Samples this close to the edge get a `d < radius` test in the emitted GLSL
+    bool maybe_skippable = dmin >= filter->radius - M_SQRT2;
+
+    // Check for samples that definitely won't contribute to anti-ringing
+    const float ar_radius = filter->radius_zero;
+    use_ar &= dmin < ar_radius;
+
+#pragma GLSL                                                    \
+    offset = ivec2(${const int: x}, ${const int: y});           \
+    d = length(vec2(offset) - fcoord);                          \
+    @if (maybe_skippable)                                       \
+        if (d < $radius) {                                      \
+    w = $lut(d * 1.0 / $radius);                                \
+    wsum += w;                                                  \
+    @if (in != NULL_IDENT) {                                    \
+        @for (c : comp_mask)                                    \
+            c[@c] = ${in}_@c[idx];                              \
+    @} else {                                                   \
+        c = textureLod($tex, base + pt * vec2(offset), 0.0);    \
+    @}                                                          \
+    @for (c : comp_mask)                                        \
+        color[@c] += w * c[@c];                                 \
+    @if (use_ar) {                                              \
+        if (d <= ${const float: ar_radius}) {                   \
+            @for (c : comp_mask) {                              \
+                cc = vec2($scale * c[@c]);                      \
+                cc.x = 1.0 - cc.x;                              \
+                ww = cc + vec2(0.10);                           \
+                ww = ww * ww;                                   \
+                ww = ww * ww;                                   \
+                ww = ww * ww;                                   \
+                ww = ww * ww;                                   \
+                ww = ww * ww;                                   \
+                ww = w * ww;                                    \
+                ar@c += ww * cc;                                \
+                wwsum@c += ww;                                  \
+            @}                                                  \
+        }                                                       \
+    @}                                                          \
+    @if (maybe_skippable)                                       \
+        }
+}
+
+struct sh_sampler_obj {
+    pl_filter filter; // generated filter; pl_shader_sample_polar regenerates it when the config changes
+    pl_shader_obj lut; // LUT object that fill_polar_lut populates from the filter's weights
+    pl_shader_obj pass2; // for pl_shader_sample_ortho
+};
+
+#define SCALER_LUT_SIZE     256 // LUT width; also passed as the filter's lut_entries
+#define SCALER_LUT_CUTOFF   1e-3f // passed as the `cutoff` of the generated filter
+
+static void sh_sampler_uninit(pl_gpu gpu, void *ptr)
+{
+    struct sh_sampler_obj *sampler = ptr;
+    pl_shader_obj_destroy(&sampler->lut);
+    pl_shader_obj_destroy(&sampler->pass2);
+    pl_filter_free(&sampler->filter);
+    *sampler = (struct sh_sampler_obj) {0}; // reset every member to zero
+}
+
+static void fill_polar_lut(void *data, const struct sh_lut_params *params)
+{
+    const struct sh_sampler_obj *sampler = params->priv;
+    pl_filter filt = sampler->filter;
+    pl_assert(params->width == filt->params.lut_entries && params->comps == 1);
+    // One float per LUT entry, copied verbatim from the filter's weights
+    memcpy(data, filt->weights, params->width * sizeof(float));
+}
+
+bool pl_shader_sample_polar(pl_shader sh, const struct pl_sample_src *src,
+                            const struct pl_sample_filter_params *params)
+{
+    pl_assert(params);
+    if (!params->filter.polar) {
+        SH_FAIL(sh, "Trying to use polar sampling with a non-polar filter?");
+        return false;
+    }
+
+    uint8_t cmask;
+    float rx, ry, scalef;
+    ident_t src_tex, pos, pt, scale;
+    if (!setup_src(sh, src, &src_tex, &pos, &pt, &rx, &ry, &cmask, &scalef, false, FASTEST))
+        return false;
+
+    struct sh_sampler_obj *obj;
+    obj = SH_OBJ(sh, params->lut, PL_SHADER_OBJ_SAMPLER, struct sh_sampler_obj,
+                 sh_sampler_uninit);
+    if (!obj)
+        return false;
+
+    float inv_scale = 1.0 / PL_MIN(rx, ry);
+    inv_scale = PL_MAX(inv_scale, 1.0);
+    if (params->no_widening)
+        inv_scale = 1.0;
+    scale = sh_const_float(sh, "scale", scalef);
+
+    struct pl_filter_config cfg = params->filter;
+    cfg.antiring = PL_DEF(cfg.antiring, params->antiring);
+    cfg.blur = PL_DEF(cfg.blur, 1.0f) * inv_scale;
+    bool update = !obj->filter || !pl_filter_config_eq(&obj->filter->params.config, &cfg);
+    if (update) {
+        pl_filter_free(&obj->filter);
+        obj->filter = pl_filter_generate(sh->log, pl_filter_params(
+            .config         = cfg,
+            .lut_entries    = SCALER_LUT_SIZE,
+            .cutoff         = SCALER_LUT_CUTOFF,
+        ));
+
+        if (!obj->filter) {
+            // This should never happen, but just in case ..
+            SH_FAIL(sh, "Failed initializing polar filter!");
+            return false;
+        }
+    }
+
+    describe_filter(sh, &cfg, "polar", rx, ry);
+    GLSL("// pl_shader_sample_polar                     \n"
+         "vec4 color = vec4(0.0);                       \n"
+         "{                                             \n"
+         "vec2 pos = "$", pt = "$";                     \n"
+         "vec2 size = vec2(textureSize("$", 0));        \n"
+         "vec2 fcoord = fract(pos * size - vec2(0.5));  \n"
+         "vec2 base = pos - pt * fcoord;                \n"
+         "vec2 center = base + pt * vec2(0.5);          \n"
+         "ivec2 offset;                                 \n"
+         "float w, d, wsum = 0.0;                       \n"
+         "int idx;                                      \n"
+         "vec4 c;                                       \n",
+         pos, pt, src_tex);
+
+    bool use_ar = cfg.antiring > 0;
+    if (use_ar) {
+#pragma GLSL                                                                    \
+        vec2 ww, cc;                                                            \
+        @for (c : cmask)                                                        \
+            vec2 ar@c = vec2(0.0), wwsum@c = vec2(0.0);
+    }
+
+    int bound   = ceil(obj->filter->radius);
+    int offset  = bound - 1; // padding top/left
+    int padding = offset + bound; // total padding
+
+    // Determined experimentally on modern AMD and Nvidia hardware. 32 is a
+    // good tradeoff for the horizontal work group size. Apart from that,
+    // just use as many threads as possible.
+    const int bw = 32, bh = sh_glsl(sh).max_group_threads / bw;
+
+    // We need to sample everything from base_min to base_max, so make sure we
+    // have enough room in shmem. The extra margin on the ceilf guards against
+    // floating point inaccuracy on near-integer scaling ratios.
+    const float margin = 1e-5;
+    int iw = (int) ceilf(bw / rx - margin) + padding + 1,
+        ih = (int) ceilf(bh / ry - margin) + padding + 1;
+    int sizew = iw, sizeh = ih;
+
+    pl_gpu gpu = SH_GPU(sh);
+    bool dynamic_size = SH_PARAMS(sh).dynamic_constants ||
+                        !gpu || !gpu->limits.array_size_constants;
+    if (dynamic_size) {
+        // Overallocate the array slightly to reduce recompilation overhead
+        sizew = PL_ALIGN2(sizew, 8);
+        sizeh = PL_ALIGN2(sizeh, 8);
+    }
+
+    int num_comps = __builtin_popcount(cmask);
+    int shmem_req = (sizew * sizeh * num_comps + 2) * sizeof(float);
+    bool is_compute = !params->no_compute && sh_glsl(sh).compute &&
+                      sh_try_compute(sh, bw, bh, false, shmem_req);
+
+    // Note: SH_LUT_LITERAL might be faster in some specific cases, but not by
+    // much, and it's catastrophically slow on other platforms.
+    ident_t lut = sh_lut(sh, sh_lut_params(
+        .object     = &obj->lut,
+        .lut_type   = SH_LUT_TEXTURE,
+        .var_type   = PL_VAR_FLOAT,
+        .method     = SH_LUT_LINEAR,
+        .width      = SCALER_LUT_SIZE,
+        .comps      = 1,
+        .update     = update,
+        .fill       = fill_polar_lut,
+        .priv       = obj,
+    ));
+
+    if (!lut) {
+        SH_FAIL(sh, "Failed initializing polar LUT!");
+        return false;
+    }
+
+    ident_t radius_c = sh_const_float(sh, "radius", obj->filter->radius);
+    ident_t in = sh_fresh(sh, "in");
+
+    if (is_compute) {
+
+        // Compute shader kernel
+        GLSL("uvec2 base_id = uvec2(0u); \n");
+        if (src->rect.x0 > src->rect.x1)
+            GLSL("base_id.x = gl_WorkGroupSize.x - 1u; \n");
+        if (src->rect.y0 > src->rect.y1)
+            GLSL("base_id.y = gl_WorkGroupSize.y - 1u; \n");
+
+        GLSLH("shared vec2 "$"_base; \n", in);
+        GLSL("if (gl_LocalInvocationID.xy == base_id)               \n"
+             "    "$"_base = base;                                  \n"
+             "barrier();                                            \n"
+             "ivec2 rel = ivec2(round((base - "$"_base) * size));   \n",
+             in, in);
+
+        ident_t sizew_c = sh_const(sh, (struct pl_shader_const) {
+            .type = PL_VAR_SINT,
+            .compile_time = true,
+            .name = "sizew",
+            .data = &sizew,
+        });
+
+        ident_t sizeh_c = sh_const(sh, (struct pl_shader_const) {
+            .type = PL_VAR_SINT,
+            .compile_time = true,
+            .name = "sizeh",
+            .data = &sizeh,
+        });
+
+        ident_t iw_c = sizew_c, ih_c = sizeh_c;
+        if (dynamic_size) {
+            iw_c = sh_const_int(sh, "iw", iw);
+            ih_c = sh_const_int(sh, "ih", ih);
+        }
+
+        // Load all relevant texels into shmem
+        GLSL("for (int y = int(gl_LocalInvocationID.y); y < "$"; y += %d) {     \n"
+             "for (int x = int(gl_LocalInvocationID.x); x < "$"; x += %d) {     \n"
+             "c = textureLod("$", "$"_base + pt * vec2(x - %d, y - %d), 0.0);   \n",
+             ih_c, bh, iw_c, bw, src_tex, in, offset, offset);
+
+        for (uint8_t comps = cmask; comps;) {
+            uint8_t c = __builtin_ctz(comps);
+            GLSLH("shared float "$"_%d["$" * "$"]; \n", in, c, sizeh_c, sizew_c);
+            GLSL(""$"_%d["$" * y + x] = c[%d]; \n", in, c, sizew_c, c);
+            comps &= ~(1 << c);
+        }
+
+        GLSL("}}                     \n"
+             "barrier();             \n");
+
+        // Dispatch the actual samples
+        for (int y = 1 - bound; y <= bound; y++) {
+            for (int x = 1 - bound; x <= bound; x++) {
+                GLSL("idx = "$" * rel.y + rel.x + "$" * %d + %d; \n",
+                     sizew_c, sizew_c, y + offset, x + offset);
+                polar_sample(sh, obj->filter, src_tex, lut, radius_c,
+                             x, y, cmask, in, use_ar, scale);
+            }
+        }
+    } else {
+        // Fragment shader sampling
+        for (uint8_t comps = cmask; comps;) {
+            uint8_t c = __builtin_ctz(comps);
+            GLSL("vec4 "$"_%d; \n", in, c);
+            comps &= ~(1 << c);
+        }
+
+        // For maximum efficiency, we want to use textureGather() if
+        // possible, rather than direct sampling. Since this is not
+        // always possible/sensible, we need to possibly intermix gathering
+        // with regular sampling. This requires keeping track of which
+        // pixels in the next row were already gathered by the previous
+        // row.
+        uint32_t gathered_cur = 0x0, gathered_next = 0x0;
+        const float radius2 = PL_SQUARE(obj->filter->radius);
+        const int base = bound - 1;
+
+        if (base + bound >= 8 * sizeof(gathered_cur)) {
+            SH_FAIL(sh, "Polar radius %f exceeds implementation capacity!",
+                    obj->filter->radius);
+            return false;
+        }
+
+        for (int y = 1 - bound; y <= bound; y++) {
+            for (int x = 1 - bound; x <= bound; x++) {
+                // Skip already gathered texels
+                uint32_t bit = 1llu << (base + x);
+                if (gathered_cur & bit)
+                    continue;
+
+                // Using texture gathering is only more efficient than direct
+                // sampling in the case where we expect to be able to use all
+                // four gathered texels, without having to discard any. So
+                // only do it if we suspect it will be a win rather than a
+                // loss.
+                int xx = x*x, xx1 = (x+1)*(x+1);
+                int yy = y*y, yy1 = (y+1)*(y+1);
+                bool use_gather = PL_MAX(xx, xx1) + PL_MAX(yy, yy1) < radius2;
+                use_gather &= PL_MAX(x, y) <= sh_glsl(sh).max_gather_offset;
+                use_gather &= PL_MIN(x, y) >= sh_glsl(sh).min_gather_offset;
+                use_gather &= !src->tex || src->tex->params.format->gatherable;
+
+                // Gathering from components other than the R channel requires
+                // support for GLSL 400, which introduces the overload of
+                // textureGather* that allows specifying the component.
+                //
+                // This is also the minimum requirement if we don't know the
+                // texture format capabilities, for the sampler2D interface
+                if (cmask != 0x1 || !src->tex)
+                    use_gather &= sh_glsl(sh).version >= 400;
+
+                if (!use_gather) {
+                    // Switch to direct sampling instead
+                    polar_sample(sh, obj->filter, src_tex, lut, radius_c,
+                                 x, y, cmask, NULL_IDENT, use_ar, scale);
+                    continue;
+                }
+
+                // Gather the four surrounding texels simultaneously
+                for (uint8_t comps = cmask; comps;) {
+                    uint8_t c = __builtin_ctz(comps);
+                    if (x || y) {
+                        if (c) {
+                            GLSL($"_%d = textureGatherOffset("$", "
+                                 "center, ivec2(%d, %d), %d); \n",
+                                 in, c, src_tex, x, y, c);
+                        } else {
+                            GLSL($"_0 = textureGatherOffset("$", "
+                                 "center, ivec2(%d, %d)); \n",
+                                 in, src_tex, x, y);
+                        }
+                    } else {
+                        if (c) {
+                            GLSL($"_%d = textureGather("$", center, %d); \n",
+                                 in, c, src_tex, c);
+                        } else {
+                            GLSL($"_0 = textureGather("$", center); \n",
+                                 in, src_tex);
+                        }
+                    }
+                    comps &= ~(1 << c);
+                }
+
+                // Mix in all of the points with their weights
+                for (int p = 0; p < 4; p++) {
+                    // The four texels are gathered counterclockwise starting
+                    // from the bottom left
+                    static const int xo[4] = {0, 1, 1, 0};
+                    static const int yo[4] = {1, 1, 0, 0};
+                    if (x+xo[p] > bound || y+yo[p] > bound)
+                        continue; // next subpixel
+
+                    GLSL("idx = %d;\n", p);
+                    polar_sample(sh, obj->filter, src_tex, lut, radius_c,
+                                 x+xo[p], y+yo[p], cmask, in, use_ar, scale);
+                }
+
+                // Mark the other next row's pixels as already gathered
+                gathered_next |= bit | (bit << 1);
+                x++; // skip adjacent pixel
+            }
+
+            // Prepare for new row
+            gathered_cur = gathered_next;
+            gathered_next = 0;
+        }
+    }
+
+#pragma GLSL                                                                    \
+    color = $scale / wsum * color;                                              \
+    @if (use_ar) {                                                              \
+        @for (c : cmask) {                                                      \
+            ww = ar@c / wwsum@c;                                                \
+            ww.x = 1.0 - ww.x;                                                  \
+            w = clamp(color[@c], ww.x, ww.y);                                   \
+            w = mix(w, dot(ww, vec2(0.5)), ww.x > ww.y);                        \
+            color[@c] = mix(color[@c], w, ${float:cfg.antiring});               \
+        @}                                                                      \
+    @}                                                                          \
+    @if (!(cmask & (1 << PL_CHANNEL_A)))                                        \
+        color.a = 1.0;                                                          \
+    }
+
+    return true;
+}
+
+// LUT fill callback for the separated (orthogonal) sampler.
+//
+// `params->priv` must point to the `struct sh_sampler_obj` whose filter is
+// being uploaded; `data` is written as an array of floats, laid out as
+// SCALER_LUT_SIZE rows of `row_stride` entries each.
+//
+// When the filter's radius equals the radius of its first zero crossing,
+// each adjacent pair of taps (w0, w1) is stored as (w0 + w1, w1 / (w0 + w1)),
+// i.e. a combined weight followed by a fractional offset between the two
+// taps. Otherwise the filter weights are copied verbatim.
+static void fill_ortho_lut(void *data, const struct sh_lut_params *params)
+{
+    const struct sh_sampler_obj *obj = params->priv;
+    pl_filter filt = obj->filter;
+
+    if (filt->radius == filt->radius_zero) {
+        // Main lobe covers entire radius, so all weights are positive, meaning
+        // we can use the linear resampling trick
+        pl_assert(filt->row_size % 2 == 0); // taps are consumed in pairs
+        for (int n = 0; n < SCALER_LUT_SIZE; n++) {
+            const float *weights = filt->weights + n * filt->row_stride;
+            float *row = (float *) data + n * filt->row_stride;
+            for (int i = 0; i < filt->row_size; i += 2) {
+                const float w0 = weights[i], w1 = weights[i+1];
+                const float sum = w0 + w1;
+                assert(sum >= 0.0f);
+                row[i] = sum;
+                // A pair of all-zero taps would evaluate 0/0 and store a NaN
+                // offset in the LUT. Such a pair carries no weight, so any
+                // finite offset is equivalent; use 0.
+                row[i+1] = sum != 0.0f ? w1 / sum : 0.0f;
+            }
+        }
+    } else {
+        // Generic case: upload the weights unmodified
+        size_t entries = SCALER_LUT_SIZE * filt->row_stride;
+        pl_assert(params->width * params->height * params->comps == entries);
+        memcpy(data, filt->weights, entries * sizeof(float));
+    }
+}
+
+// Indices identifying the two separable sampling passes. SEP_PASSES is the
+// number of passes and is used to size per-pass arrays.
+enum {
+    SEP_VERT   = 0,
+    SEP_HORIZ  = 1,
+    SEP_PASSES = 2,
+};
+
+// Emit a single pass of a separable (orthogonal) scaling filter into `sh`.
+//
+// The pass direction is derived from the scaling ratios reported by
+// `setup_src`: exactly one axis may differ from 1.0. The filter described by
+// `params->filter` is generated (or reused from the sampler object stored in
+// `params->lut`) and uploaded as a LUT, after which the convolution is
+// emitted as GLSL writing into a `vec4 color`.
+//
+// Returns true on success. Returns false if the filter is polar, if
+// `setup_src` fails, if the source requires scaling along both axes, or if
+// the sampler object, filter or LUT could not be set up.
+bool pl_shader_sample_ortho2(pl_shader sh, const struct pl_sample_src *src,
+                             const struct pl_sample_filter_params *params)
+{
+    pl_assert(params);
+    if (params->filter.polar) {
+        SH_FAIL(sh, "Trying to use separated sampling with a polar filter?");
+        return false;
+    }
+
+    pl_gpu gpu = SH_GPU(sh);
+    pl_assert(gpu);
+
+    uint8_t comps;
+    float ratio[SEP_PASSES], scale;
+    ident_t src_tex, pos, pt;
+    if (!setup_src(sh, src, &src_tex, &pos, &pt,
+                   &ratio[SEP_HORIZ], &ratio[SEP_VERT],
+                   &comps, &scale, false, LINEAR))
+        return false;
+
+
+    // Pick the axis to filter along: an axis whose ratio is within 1e-6 of
+    // 1.0 counts as unscaled, and the pass runs along the other one. The
+    // horizontal ratio is tested first, so a source that is unscaled on both
+    // axes ends up with a vertical pass.
+    int pass;
+    if (fabs(ratio[SEP_HORIZ] - 1.0f) < 1e-6f) {
+        pass = SEP_VERT;
+    } else if (fabs(ratio[SEP_VERT] - 1.0f) < 1e-6f) {
+        pass = SEP_HORIZ;
+    } else {
+        SH_FAIL(sh, "Trying to use pl_shader_sample_ortho with a "
+                "pl_sample_src that requires scaling in multiple directions "
+                "(rx=%f, ry=%f), this is not possible!",
+                ratio[SEP_HORIZ], ratio[SEP_VERT]);
+        return false;
+    }
+
+    // We can store a separate sampler object per dimension, so dispatch the
+    // right one. This is needed for two reasons:
+    // 1. Anamorphic content can have a different scaling ratio for each
+    //    dimension. In particular, you could be upscaling in one and
+    //    downscaling in the other.
+    // 2. After fixing the source for `setup_src`, we lose information about
+    //    the scaling ratio of the other component. (Although this is only a
+    //    minor reason and could easily be changed with some boilerplate)
+    struct sh_sampler_obj *obj;
+    obj = SH_OBJ(sh, params->lut, PL_SHADER_OBJ_SAMPLER,
+                 struct sh_sampler_obj, sh_sampler_uninit);
+    if (!obj)
+        return false;
+
+    // SEP_VERT is 0, so this branch is taken for the horizontal pass, which
+    // uses a second sampler object nested inside the first (`pass2`)
+    if (pass != 0) {
+        obj = SH_OBJ(sh, &obj->pass2, PL_SHADER_OBJ_SAMPLER,
+                     struct sh_sampler_obj, sh_sampler_uninit);
+        assert(obj);
+    }
+
+    // Kernel widening factor: 1/ratio, clamped so that it never drops below
+    // 1.0 (i.e. the kernel is only ever widened, for ratios below 1), and
+    // disabled entirely when `no_widening` is set
+    float inv_scale = 1.0 / ratio[pass];
+    inv_scale = PL_MAX(inv_scale, 1.0);
+    if (params->no_widening)
+        inv_scale = 1.0;
+
+    // Work on a local copy of the filter config with the defaults and the
+    // widening factor applied; the cached filter is only regenerated when
+    // this effective config differs from the one it was built with
+    struct pl_filter_config cfg = params->filter;
+    cfg.antiring = PL_DEF(cfg.antiring, params->antiring);
+    cfg.blur = PL_DEF(cfg.blur, 1.0f) * inv_scale;
+    bool update = !obj->filter || !pl_filter_config_eq(&obj->filter->params.config, &cfg);
+
+    if (update) {
+        pl_filter_free(&obj->filter);
+        obj->filter = pl_filter_generate(sh->log, pl_filter_params(
+            .config             = cfg,
+            .lut_entries        = SCALER_LUT_SIZE,
+            .max_row_size       = gpu->limits.max_tex_2d_dim / 4,
+            .row_stride_align   = 4,
+        ));
+
+        if (!obj->filter) {
+            // This should never happen, but just in case ..
+            SH_FAIL(sh, "Failed initializing separated filter!");
+            return false;
+        }
+    }
+
+    // The LUT stores four weights per texel (comps = 4), hence the row stride
+    // (aligned to 4 above) is divided by 4 to get the texture width
+    int N = obj->filter->row_size; // number of samples to convolve
+    int width = obj->filter->row_stride / 4; // width of the LUT texture
+    ident_t lut = sh_lut(sh, sh_lut_params(
+        .object     = &obj->lut,
+        .var_type   = PL_VAR_FLOAT,
+        .method     = SH_LUT_LINEAR,
+        .width      = width,
+        .height     = SCALER_LUT_SIZE,
+        .comps      = 4,
+        .update     = update,
+        .fill       = fill_ortho_lut,
+        .priv       = obj,
+    ));
+    if (!lut) {
+        SH_FAIL(sh, "Failed initializing separated LUT!");
+        return false;
+    }
+
+    // Unit step along the filtered axis, indexed by pass
+    const int dir[SEP_PASSES][2] = {
+        [SEP_HORIZ] = {1, 0},
+        [SEP_VERT]  = {0, 1},
+    };
+
+    static const char *names[SEP_PASSES] = {
+        [SEP_HORIZ] = "ortho (horiz)",
+        [SEP_VERT]  = "ortho (vert)",
+    };
+
+    describe_filter(sh, &cfg, names[pass], ratio[pass], ratio[pass]);
+
+    // Anti-ringing is only enabled for ratios above 1.0 and never together
+    // with the linear (paired-tap) path, which is selected under the same
+    // radius == radius_zero condition that `fill_ortho_lut` tests
+    float denom = PL_MAX(1, width - 1); // avoid division by zero
+    bool use_ar = cfg.antiring > 0 && ratio[pass] > 1.0;
+    bool use_linear = obj->filter->radius == obj->filter->radius_zero;
+    use_ar &= !use_linear; // filter has no negative weights
+
+#pragma GLSL /* pl_shader_sample_ortho */                                       \
+    vec4 color = vec4(0.0, 0.0, 0.0, 1.0);                                      \
+    {                                                                           \
+    vec2 pos = $pos, pt = $pt;                                                  \
+    vec2 size = vec2(textureSize($src_tex, 0));                                 \
+    vec2 dir = vec2(${const float:dir[pass][0]}, ${const float: dir[pass][1]}); \
+    pt *= dir;                                                                  \
+    vec2 fcoord2 = fract(pos * size - vec2(0.5));                               \
+    float fcoord = dot(fcoord2, dir);                                           \
+    vec2 base = pos - fcoord * pt - pt * vec2(${const float: N / 2 - 1});       \
+    vec4 ws;                                                                    \
+    float off;                                                                  \
+    ${vecType: comps} c, ca = ${vecType: comps}(0.0);                           \
+    @if (use_ar) {                                                              \
+        ${vecType: comps} hi = ${vecType: comps}(0.0);                          \
+        ${vecType: comps} lo = ${vecType: comps}(1e9);                          \
+    @}                                                                          \
+    @for (n < N) {                                                              \
+        @if @(n % 4 == 0)                                                       \
+            ws = $lut(vec2(float(@n / 4) / ${const float: denom}, fcoord));     \
+        @if @(vars.use_ar && (n == vars.n / 2 - 1 || n == vars.n / 2)) {        \
+            c = textureLod($src_tex, base + pt * @n.0, 0.0).${swizzle: comps};  \
+            ca += ws[@n % 4] * c;                                               \
+            lo = min(lo, c);                                                    \
+            hi = max(hi, c);                                                    \
+        @} else {                                                               \
+            @if (use_linear) {                                                  \
+                @if @(n % 2 == 0) {                                             \
+                    off = @n.0 + ws[@n % 4 + 1];                                \
+                    ca += ws[@n % 4] * textureLod($src_tex, base + pt * off,    \
+                                                  0.0).${swizzle: comps};       \
+                @}                                                              \
+            @} else {                                                           \
+                ca += ws[@n % 4] * textureLod($src_tex, base + pt * @n.0,       \
+                                              0.0).${swizzle: comps};           \
+            @}                                                                  \
+        @}                                                                      \
+    @}                                                                          \
+    @if (use_ar)                                                                \
+        ca = mix(ca, clamp(ca, lo, hi), ${float: cfg.antiring});                \
+    color.${swizzle: comps} = ${float: scale} * ca;                             \
+    }
+
+    return true;
+}
+
+// Default distortion parameters, initialized from PL_DISTORT_DEFAULTS
+const struct pl_distort_params pl_distort_default_params = { PL_DISTORT_DEFAULTS };
+
+// Emit a shader into `sh` that samples `src_tex` through an affine
+// distortion, writing the result to a `vec4 color`.
+//
+// `params->transform` is composed with the texture-to-normalized and
+// normalized-to-canvas mappings below, and the inverse of the combined
+// transform is applied to the canvas position in the shader. `out_w` and
+// `out_h` are forwarded to `sh_require` and are also used to derive the
+// scale when `params->unscaled` is set. Returns silently if `sh_require`
+// fails.
+void pl_shader_distort(pl_shader sh, pl_tex src_tex, int out_w, int out_h,
+                       const struct pl_distort_params *params)
+{
+    pl_assert(params);
+    if (!sh_require(sh, PL_SHADER_SIG_NONE, out_w, out_h))
+        return;
+
+    // Aspect-normalized extents: the longer side of the texture maps to 1,
+    // the shorter side shrinks by the aspect ratio
+    const int src_w = src_tex->params.w;
+    const int src_h = src_tex->params.h;
+    const bool wide = src_w > src_h;
+    const float rx = wide ? 1.0f : (float) src_w / src_h;
+    const float ry = wide ? (float) src_h / src_w : 1.0f;
+
+    // Texel coordinates [0,1]² -> aspect-normalized representation
+    const pl_transform2x2 tex2norm = {
+        .mat.m = {
+            { 2 * rx, 0 },
+            { 0, -2 * ry },
+        },
+        .c = { -rx, ry },
+    };
+
+    // Aspect-normalized representation -> canvas coordinates [-1,1]². In
+    // unscaled mode the source keeps its size relative to the output.
+    float sx = 1.0f, sy = 1.0f;
+    if (params->unscaled) {
+        sx = (float) src_w / out_w;
+        sy = (float) src_h / out_h;
+    }
+
+    const pl_transform2x2 norm2canvas = {
+        .mat.m = {
+            { sx / rx, 0 },
+            { 0, sy / ry },
+        },
+    };
+
+    // Combined tex2canvas transform: norm2canvas * user transform * tex2norm
+    pl_transform2x2 transform = params->transform;
+    pl_transform2x2_mul(&transform, &tex2norm);
+    pl_transform2x2_rmul(&norm2canvas, &transform);
+
+    if (params->constrain) {
+        // Shrink the transform if the bounding box of the transformed unit
+        // rectangle is larger than the canvas (which spans 2 units)
+        pl_rect2df bb = pl_transform2x2_bounds(&transform, &(pl_rect2df) {
+            .x1 = 1, .y1 = 1,
+        });
+        const float extent = fmaxf(pl_rect_w(bb), pl_rect_h(bb));
+        const float k = fmaxf(extent, 2.0f);
+        pl_transform2x2_scale(&transform, 2.0f / k);
+    }
+
+    // The canvas is bound as [-1,1]² with the vertical axis flipped, matching
+    // the usual mathematical axis orientation
+    static const pl_rect2df canvas = {
+        .x0 = -1.0f, .x1 =  1.0f,
+        .y0 =  1.0f, .y1 = -1.0f,
+    };
+
+    ident_t pos = sh_attr_vec2(sh, "pos", &canvas);
+    ident_t pt;
+    ident_t tex = sh_bind(sh, src_tex, params->address_mode,
+                          PL_TEX_SAMPLE_LINEAR, "tex", NULL, NULL, &pt);
+
+    // The shader needs canvas2tex, i.e. the inverse of the transform built
+    // above; upload its matrix and offset parts as separate variables
+    pl_transform2x2_invert(&transform);
+    ident_t tf = sh_var(sh, (struct pl_shader_var) {
+        .var  = pl_var_mat2("tf"),
+        .data = PL_TRANSPOSE_2X2(transform.mat.m),
+    });
+
+    ident_t tf_c = sh_var(sh, (struct pl_shader_var) {
+        .var  = pl_var_vec2("tf_c"),
+        .data = transform.c,
+    });
+
+    sh_describe(sh, "distortion");
+
+    // For the bicubic branch below, see pl_shader_sample_bicubic
+#pragma GLSL /* pl_shader_sample_distort */                 \
+    vec4 color;                                             \
+    {                                                       \
+    vec2 pos = $tf * $pos + $tf_c;                          \
+    vec2 pt = $pt;                                          \
+    @if (params->bicubic) {                                 \
+        vec2 size = vec2(textureSize($tex, 0));             \
+        vec2 frac  = fract(pos * size + vec2(0.5));         \
+        vec2 frac2 = frac * frac;                           \
+        vec2 inv   = vec2(1.0) - frac;                      \
+        vec2 inv2  = inv * inv;                             \
+        vec2 w0 = 1.0/6.0 * inv2 * inv;                     \
+        vec2 w1 = 2.0/3.0 - 0.5 * frac2 * (2.0 - frac);     \
+        vec2 w2 = 2.0/3.0 - 0.5 * inv2  * (2.0 - inv);      \
+        vec2 w3 = 1.0/6.0 * frac2 * frac;                   \
+        vec4 g = vec4(w0 + w1, w2 + w3);                    \
+        vec4 h = vec4(w1, w3) / g + inv.xyxy;               \
+        h.xy -= vec2(2.0);                                  \
+        vec4 p = pos.xyxy + pt.xyxy * h;                    \
+        vec4 c00 = textureLod($tex, p.xy, 0.0);             \
+        vec4 c01 = textureLod($tex, p.xw, 0.0);             \
+        vec4 c0 = mix(c01, c00, g.y);                       \
+        vec4 c10 = textureLod($tex, p.zy, 0.0);             \
+        vec4 c11 = textureLod($tex, p.zw, 0.0);             \
+        vec4 c1 = mix(c11, c10, g.y);                       \
+        color = mix(c1, c0, g.x);                           \
+    @} else {                                               \
+        color = texture($tex, pos);                         \
+    @}                                                      \
+    @if (params->alpha_mode) {                              \
+        vec2 border = min(pos, vec2(1.0) - pos);            \
+        border = smoothstep(vec2(0.0), pt, border);         \
+        @if (params->alpha_mode == PL_ALPHA_PREMULTIPLIED)  \
+            color.rgba *= border.x * border.y;              \
+        @else                                               \
+            color.a *= border.x * border.y;                 \
+    @}                                                      \
+    }
+
+}