8 files changed, 2982 insertions, 0 deletions
diff --git a/docs/CNAME b/docs/CNAME
new file mode 100644
index 0000000..3be539d
--- /dev/null
+++ b/docs/CNAME
@@ -0,0 +1 @@
+libplacebo.org
diff --git a/docs/basic-rendering.md b/docs/basic-rendering.md
new file mode 100644
index 0000000..09a1f6b
--- /dev/null
+++ b/docs/basic-rendering.md
@@ -0,0 +1,432 @@
+# Basic windowing / output example
+
+We will demonstrate the basics of the libplacebo GPU output API with a worked
+example. The goal is to show a simple color on screen.
+
+## Creating a `pl_log`
+
+Almost all major entry-points into libplacebo require providing a log
+callback (or `NULL` to disable logging). This is abstracted into the `pl_log`
+object type, which we can create with
+`pl_log_create`:
+
+``` c linenums="1"
+#include <libplacebo/log.h>
+
+pl_log pllog;
+
+int main()
+{
+    pllog = pl_log_create(PL_API_VER, pl_log_params(
+        .log_cb = pl_log_color,
+        .log_level = PL_LOG_INFO,
+    ));
+
+    // ...
+
+    pl_log_destroy(&pllog);
+    return 0;
+}
+```
+
+!!! note "Compiling"
+
+    You can compile this example with:
+
+    ``` bash
+    $ gcc example.c -o example `pkg-config --cflags --libs libplacebo`
+    ```
+
+The parameter `PL_API_VER` has no special significance and is merely included
+for historical reasons. Aside from that, this snippet introduces a number of
+core concepts of the libplacebo API:
+
+### Parameter structs
+
+For extensibility, almost all libplacebo calls take a pointer to a `const
+struct pl_*_params`, into which all extensible parameters go. For convenience,
+libplacebo provides macros which create anonymous params structs on the stack
+(and also fill in default parameters). Note that this only works for C99 and
+above; users of C89 and C++ must initialize parameter structs manually.
+
+Under the hood, `pl_log_params(...)` just translates to `&((struct
+pl_log_params) { /* default params */, ... })`. This style of API allows
+libplacebo to effectively simulate optional named parameters.
+
+!!! note "On default parameters"
+
+    Wherever possible, parameters are designed in such a way that `{0}` gives
+    you a minimal parameter structure, with default behavior and no optional
+    features enabled. This is done for forwards compatibility - as new
+    features are introduced, old struct initializers will simply opt out of
+    them.
+
+### Destructors
+
+All libplacebo objects must be destroyed manually using the corresponding
+`pl_*_destroy` call, which takes a pointer to the variable the object is
+stored in. The variable is then reset to `NULL`. This helps prevent
+use-after-free bugs.
+
+!!! note "NULL"
+
+    As a general rule, all libplacebo destructors are safe to call on
+    variables containing `NULL`. So, users need not explicitly `NULL`-test
+    before calling destructors on variables.
+
+## Creating a window
+
+While libplacebo can work in isolation, to render images offline, for the sake
+of this guide we want to provide something graphical on-screen. As such, we
+need to create some sort of window. Libplacebo provides no built-in mechanism
+for this; it assumes the API user will already have a windowing system
+in-place.
+
+Complete examples (based on GLFW and SDL) can be found [in the libplacebo
+demos](https://code.videolan.org/videolan/libplacebo/-/tree/master/demos). But
+for now, we will focus on getting a very simple window on-screen using GLFW:
+
+``` c linenums="1" hl_lines="3 5 6 7 9 17 18 20 21 22 24 25 26 28 29"
+// ...
+
+#include <GLFW/glfw3.h>
+
+const char * const title = "libplacebo demo";
+int width = 800;
+int height = 600;
+
+GLFWwindow *window;
+
+int main()
+{
+    pllog = pl_log_create(PL_API_VER, pl_log_params(
+        .log_level = PL_LOG_INFO,
+    ));
+
+    if (!glfwInit())
+        return 1;
+
+    window = glfwCreateWindow(width, height, title, NULL, NULL);
+    if (!window)
+        return 1;
+
+    while (!glfwWindowShouldClose(window)) {
+        glfwWaitEvents();
+    }
+
+    glfwDestroyWindow(window);
+    glfwTerminate();
+    pl_log_destroy(&pllog);
+    return 0;
+}
+```
+
+!!! note "Compiling"
+
+    We now also need to include the glfw3 library to compile this example.
+
+    ``` bash
+    $ gcc example.c -o example `pkg-config --cflags --libs glfw3 libplacebo`
+    ```
+
+## Creating the `pl_gpu`
+
+All GPU operations are abstracted into an internal `pl_gpu` object, which
+serves as the primary entry-point to any sort of GPU interaction. This object
+cannot be created directly, but must be obtained from some graphical API:
+currently there are Vulkan, OpenGL or D3D11. A `pl_gpu` can be accessed from
+an API-specific object like `pl_vulkan`, `pl_opengl` and `pl_d3d11`.
+
+In this guide, for simplicity, we will be using OpenGL, simply because that's
+what GLFW initializes by default.
+
+``` c linenums="1" hl_lines="3 5-6 15-23 29 36-45"
+// ...
+
+pl_opengl opengl;
+
+static bool make_current(void *priv);
+static void release_current(void *priv);
+
+int main()
+{
+    // ...
+    window = glfwCreateWindow(width, height, title, NULL, NULL);
+    if (!window)
+        return 1;
+
+    opengl = pl_opengl_create(pllog, pl_opengl_params(
+        .get_proc_addr      = glfwGetProcAddress,
+        .allow_software     = true,         // allow software rasterizers
+        .debug              = true,         // enable error reporting
+        .make_current       = make_current, // (1)
+        .release_current    = release_current,
+    ));
+    if (!opengl)
+        return 2;
+
+    while (!glfwWindowShouldClose(window)) {
+        glfwWaitEvents();
+    }
+
+    pl_opengl_destroy(&opengl);
+    glfwDestroyWindow(window);
+    glfwTerminate();
+    pl_log_destroy(&pllog);
+    return 0;
+}
+
+static bool make_current(void *priv)
+{
+    glfwMakeContextCurrent(window);
+    return true;
+}
+
+static void release_current(void *priv)
+{
+    glfwMakeContextCurrent(NULL);
+}
+```
+
+1.  Setting this allows the resulting `pl_gpu` to be thread-safe, which
+    enables asynchronous transfers to be used. The alternative is to simply
+    call `glfwMakeContextCurrent` once after creating the window.
+
+    This method of making the context current is generally preferred,
+    however, so we've demonstrated it here for completeness' sake.
+
+## Creating a swapchain
+
+All access to window-based rendering commands is abstracted into an object
+known as a "swapchain" (from Vulkan terminology), including the default
+backbuffers on D3D11 and OpenGL. If we want to present something to screen,
+we need to first create a `pl_swapchain`.
+
+We can use this swapchain to perform the equivalent of `gl*SwapBuffers`:
+
+``` c linenums="1" hl_lines="2 4-9 17-22 24-27 30-31 34"
+// ...
+pl_swapchain swchain;
+
+static void resize_cb(GLFWwindow *win, int new_w, int new_h)
+{
+    width  = new_w;
+    height = new_h;
+    pl_swapchain_resize(swchain, &width, &height);
+}
+
+int main()
+{
+    // ...
+    if (!opengl)
+        return 2;
+
+    swchain = pl_opengl_create_swapchain(opengl, pl_opengl_swapchain_params(
+        .swap_buffers   = (void (*)(void *)) glfwSwapBuffers,
+        .priv           = window,
+    ));
+    if (!swchain)
+        return 2;
+
+    // (2)
+    if (!pl_swapchain_resize(swchain, &width, &height))
+        return 2;
+    glfwSetFramebufferSizeCallback(window, resize_cb);
+
+    while (!glfwWindowShouldClose(window)) {
+        pl_swapchain_swap_buffers(swchain);
+        glfwPollEvents(); // (1)
+    }
+
+    pl_swapchain_destroy(&swchain);
+    pl_opengl_destroy(&opengl);
+    glfwDestroyWindow(window);
+    glfwTerminate();
+    pl_log_destroy(&pllog);
+    return 0;
+}
+```
+
+1.  We change this from `glfwWaitEvents` to `glfwPollEvents` because
+    we now want to re-run our main loop once per vsync, rather than only when
+    new events arrive.  The `pl_swapchain_swap_buffers` call will ensure
+    that this does not execute too quickly.
+
+2.  The swapchain needs to be resized to fit the size of the window, which in
+    GLFW is handled by listening to a callback. In addition to setting this
+    callback, we also need to inform the swapchain of the initial window size.
+
+    Note that the `pl_swapchain_resize` function handles both resize requests
+    and size queries - hence, the actual swapchain size is returned back to
+    the passed variables.
+
+## Getting pixels on the screen
+
+With a swapchain in hand, we're now equipped to start drawing pixels to the
+screen:
+
+``` c linenums="1" hl_lines="3-8 15-20"
+// ...
+
+static void render_frame(struct pl_swapchain_frame frame)
+{
+    pl_gpu gpu = opengl->gpu;
+
+    pl_tex_clear(gpu, frame.fbo, (float[4]){ 1.0, 0.5, 0.0, 1.0 });
+}
+
+int main()
+{
+    // ...
+
+    while (!glfwWindowShouldClose(window)) {
+        struct pl_swapchain_frame frame;
+        while (!pl_swapchain_start_frame(swchain, &frame))
+            glfwWaitEvents(); // (1)
+        render_frame(frame);
+        if (!pl_swapchain_submit_frame(swchain))
+            break; // (2)
+
+        pl_swapchain_swap_buffers(swchain);
+        glfwPollEvents();
+    }
+
+    // ...
+}
+```
+
+1.  If `pl_swapchain_start_frame` fails, it typically means the window is
+    hidden, minimized or blocked. This is not a fatal condition, and as such
+    we simply want to process window events until we can resume rendering.
+
+2.  If `pl_swapchain_submit_frame` fails, it typically means the window has
+    been lost, and further rendering commands are not expected to succeed.
+    As such, in this case, we simply terminate the example program.
+
+Our main render loop has changed into a combination of
+`pl_swapchain_start_frame`, rendering, and `pl_swapchain_submit_frame`. To
+start with, we simply use the `pl_tex_clear` function to blit a constant
+orange color to the framebuffer.
+
+### Interlude: Rendering commands
+
+The previous code snippet represented our first foray into the `pl_gpu` API.
+For more detail on this API, see the [GPU API](#TODO) section. But as a
+general rule of thumb, all `pl_gpu`-level operations are thread safe,
+asynchronous (except when returning something to the CPU), and internally
+refcounted (so you can destroy all objects as soon as you no longer need the
+reference).
+
+In the example loop, `pl_swapchain_swap_buffers` is the only operation that
+actually flushes commands to the GPU. You can force an early flush with
+`pl_gpu_flush()` or `pl_gpu_finish()`, but other than that, commands will
+"queue" internally and complete asynchronously at some unknown point in time,
+until forward progress is needed (e.g. `pl_tex_download`).
+
+## Conclusion
+
+We have demonstrated how to create a window, how to initialize the libplacebo
+API, create a GPU instance based on OpenGL, and how to write a basic rendering
+loop that blits a single color to the framebuffer.
+
+Here is a complete transcript of the example we built in this section:
+
+??? example "Basic rendering"
+    ``` c linenums="1"
+    #include <GLFW/glfw3.h>
+    
+    #include <libplacebo/log.h>
+    #include <libplacebo/opengl.h>
+    #include <libplacebo/gpu.h>
+    
+    const char * const title = "libplacebo demo";
+    int width = 800;
+    int height = 600;
+    
+    GLFWwindow *window;
+    
+    pl_log pllog;
+    pl_opengl opengl;
+    pl_swapchain swchain;
+    
+    static bool make_current(void *priv);
+    static void release_current(void *priv);
+    
+    static void resize_cb(GLFWwindow *win, int new_w, int new_h)
+    {
+        width  = new_w;
+        height = new_h;
+        pl_swapchain_resize(swchain, &width, &height);
+    }
+    
+    static void render_frame(struct pl_swapchain_frame frame)
+    {
+        pl_gpu gpu = opengl->gpu;
+    
+        pl_tex_clear(gpu, frame.fbo, (float[4]){ 1.0, 0.5, 0.0, 1.0 });
+    }
+    
+    int main()
+    {
+        pllog = pl_log_create(PL_API_VER, pl_log_params(
+            .log_cb = pl_log_color,
+            .log_level = PL_LOG_INFO,
+        ));
+    
+        if (!glfwInit())
+            return 1;
+    
+        window = glfwCreateWindow(width, height, title, NULL, NULL);
+        if (!window)
+            return 1;
+    
+        opengl = pl_opengl_create(pllog, pl_opengl_params(
+            .get_proc_addr      = glfwGetProcAddress,
+            .allow_software     = true,         // allow software rasterizers
+            .debug              = true,         // enable error reporting
+            .make_current       = make_current,
+            .release_current    = release_current,
+        ));
+    
+        swchain = pl_opengl_create_swapchain(opengl, pl_opengl_swapchain_params(
+            .swap_buffers   = (void (*)(void *)) glfwSwapBuffers,
+            .priv           = window,
+        ));
+        if (!swchain)
+            return 2;
+    
+        if (!pl_swapchain_resize(swchain, &width, &height))
+            return 2;
+        glfwSetFramebufferSizeCallback(window, resize_cb);
+    
+        while (!glfwWindowShouldClose(window)) {
+            struct pl_swapchain_frame frame;
+            while (!pl_swapchain_start_frame(swchain, &frame))
+                glfwWaitEvents();
+            render_frame(frame);
+            if (!pl_swapchain_submit_frame(swchain))
+                break;
+    
+            pl_swapchain_swap_buffers(swchain);
+            glfwPollEvents();
+        }
+    
+        pl_swapchain_destroy(&swchain);
+        pl_opengl_destroy(&opengl);
+        glfwDestroyWindow(window);
+        glfwTerminate();
+        pl_log_destroy(&pllog);
+        return 0;
+    }
+    
+    static bool make_current(void *priv)
+    {
+        glfwMakeContextCurrent(window);
+        return true;
+    }
+    
+    static void release_current(void *priv)
+    {
+        glfwMakeContextCurrent(NULL);
+    }
+    ```
diff --git a/docs/custom-shaders.md b/docs/custom-shaders.md
new file mode 100644
index 0000000..c6dc107
--- /dev/null
+++ b/docs/custom-shaders.md
@@ -0,0 +1,729 @@
+# Custom Shaders (mpv .hook syntax)
+
+libplacebo supports the same [custom shader syntax used by
+mpv](https://mpv.io/manual/master/#options-glsl-shader), with some important
+changes. This document will serve as a complete reference for this syntax.
+
+## Overview
+
+In general, user shaders are divided into distinct *blocks*. Each block can
+define a shader, a texture, a buffer, or a tunable parameter. Each block
+starts with a collection of header directives, which are lines starting with
+the syntax `//!`.
+
+As an example, here is a simple shader that simply inverts the video signal:
+
+``` glsl linenums="1"
+//!HOOK LUMA
+//!HOOK RGB
+//!BIND HOOKED
+
+vec4 hook()
+{
+    vec4 color = HOOKED_texOff(0);
+    color.rgb = vec3(1.0) - color.rgb;
+    return color;
+}
+```
+
+This shader defines one block - a shader block which hooks into the two
+texture stages `LUMA` and `RGB`, binds the hooked texture, inverts the value
+of the `rgb` channels, and then returns the modified color.
+
+### Expressions
+
+In a few contexts, shader directives accept arithmetic expressions, denoted by
+`<expr>` in the listing below. For historical reasons, all expressions are
+given in [reverse polish notation
+(RPN)](https://en.wikipedia.org/wiki/Reverse_Polish_notation), and the only
+value type is a floating point number. The following value types and
+arithmetic operations are available:
+
+* `1.234`: Literal float constant, evaluates to itself.
+* `NAME.w`, `NAME.width`: Evaluates to the width of a texture with name `NAME`.
+* `NAME.h`, `NAME.height`: Evaluates to the height of a texture with name `NAME`.
+* `PAR`: Evaluates to the value of a tunable shader parameter with name `PAR`.
+* `+`: Evaluates to `X+Y`.
+* `-`: Evaluates to `X-Y`.
+* `*`: Evaluates to `X*Y`.
+* `/`: Evaluates to `X/Y`.
+* `%`: Evaluates to `fmod(X, Y)`.
+* `>`: Evaluates to `(X > Y) ? 1.0 : 0.0`.
+* `<`: Evaluates to `(X < Y) ? 1.0 : 0.0`.
+* `=`: Evaluates to `fuzzy_eq(X, Y) ? 1.0 : 0.0`, with some tolerance to
+  allow for floating point inaccuracy. (Around 1 ppm)
+* `!`: Evaluates to `X ? 0.0 : 1.0`.
+
+Note that `+` and `*` can be used as suitable replacements for the otherwise
+absent boolean logic expressions (`||` and `&&`).
+
+## Shaders
+
+Shaders are the default block type, and have no special syntax to indicate
+their presence. Shader stages contain raw GLSL code that will be
+(conditionally) executed. This GLSL snippet must define a single function
+`vec4 hook()`, or `void hook()` for compute shaders.
+
+During the execution of any shader, the following global variables are made
+available:
+
+* `int frame`: A raw counter tracking the number of executions of this shader
+  stage.
+* `float random`: A pseudo-random float uniformly distributed in the range
+  `[0,1)`.
+* `vec2 input_size`: The nominal size (in pixels) of the original input image.
+* `vec2 target_size`: The nominal size (in pixels) of the output rectangle.
+* `vec2 tex_offset`: The nominal offset (in pixels), of the original input crop.
+* `vec4 linearize(vec4 color)`: Linearize the input color according to the
+  image's tagged gamma function.
+* `vec4 delinearize(vec4 color)`: Opposite counterpart to `linearize`.
+
+Shader stages accept the following directives:
+
+### `HOOK <texture>`
+
+A `HOOK` directive determines when a shader stage is run. During internal
+processing, libplacebo goes over a number of pre-defined *hook points* at set
+points in the processing pipeline. It is only possible to intercept the image,
+and run custom shaders, at these fixed hook points.
+
+Here is a current list of hook points:
+
+* `RGB`: Input plane containing RGB values
+* `LUMA`: Input plane containing a Y value
+* `CHROMA`: Input plane containing chroma values (one or both)
+* `ALPHA`: Input plane containing a single alpha value
+* `XYZ`: Input plane containing XYZ values
+* `CHROMA_SCALED`: Chroma plane, after merging and upscaling to luma size
+* `ALPHA_SCALED`: Alpha plane, after upscaling to luma size
+* `NATIVE`: Merged input planes, before any sort of color conversion (as-is)
+* `MAIN`: After conversion to RGB, before linearization/scaling
+* `LINEAR`: After conversion to linear light (for scaling purposes)
+* `SIGMOID`: After conversion to sigmoidized light (for scaling purposes)
+* `PREKERNEL`: Immediately before the execution of the main scaler kernel
+* `POSTKERNEL`: Immediately after the execution of the main scaler kernel
+* `SCALED`: After scaling, in either linear or non-linear light RGB
+* `PREOUTPUT`: After color conversion to target colorspace, before alpha blending
+* `OUTPUT`: After alpha blending, before dithering and final output pass
+
+!!! warning "`MAINPRESUB`"
+    In mpv, `MAIN` and `MAINPRESUB` are separate shader stages, because the
+    mpv option `--blend-subtitles=video` allows rendering overlays directly
+    onto the pre-scaled video stage. libplacebo does not support this feature,
+    and as such, the `MAINPRESUB` shader stage does not exist. It is still
+    valid to refer to this name in shaders, but it is handled identically to
+    `MAIN`.
+
+It's possible for a hook point to never fire. For example, `SIGMOID` will not
+fire when downscaling, as sigmoidization only happens when upscaling.
+Similarly, `LUMA`/`CHROMA` will not fire on an RGB video and vice versa.
+
+A single shader stage may hook multiple hook points simultaneously, for
+example, to cover both `LUMA` and `RGB` cases with the same logic. (See the
+example shader in the introduction)
+
+### `BIND <texture>`
+
+The `BIND` directive makes a texture available for use in the shader. This can
+be any of the previously named hook points, a custom texture defined by a
+`TEXTURE` block, a custom texture saved by a `SAVE` directive, or the special
+value `HOOKED` which allows binding whatever texture hook dispatched this
+shader stage.
+
+A bound texture will define the following GLSL functions (as macros):
+
+* `sampler2D NAME_raw`: A reference to the raw texture sampler itself.
+* `vec2 NAME_pos`: The texel coordinates of the current pixel.
+* `vec2 NAME_map(ivec2 id)`: A function that maps from `gl_GlobalInvocationID`
+  to texel coordinates. (Compute shaders)
+* `vec2 NAME_size`: The size (in pixels) of the texture.
+* `vec2 NAME_pt`: Convenience macro for `1.0 / NAME_size`. The size of a
+  single pixel (in texel coordinates).
+* `vec2 NAME_off`: The sample offset of the texture. Basically, the pixel
+  coordinates of the top-left corner of the sampled area.
+* `float NAME_mul`: The coefficient that must be multiplied into sampled
+  values in order to rescale them to `[0,1]`.
+* `vec4 NAME_tex(vec2 pos)`: A wrapper around `NAME_mul * textureLod(NAME_raw,
+  pos, 0.0)`.
+* `vec4 NAME_texOff(vec2 offset)`: A wrapper around `NAME_tex(NAME_pos + NAME_pt * offset)`.
+  This can be used to easily access adjacent pixels, e.g. `NAME_texOff(vec2(-1,2))`
+  samples a pixel one to the left and two to the bottom of the current
+  location.
+* `vec4 NAME_gather(vec2 pos, int c)`: A wrapper around
+  `NAME_mul * textureGather(NAME_raw, pos, c)`, with appropriate scaling. (Only when
+  supported[^ifdef])
+
+!!! note "Rotation matrix"
+    For compatibility with mpv, we also define a `mat2 NAME_rot` which is
+    simply equal to a 2x2 identity matrix. libplacebo never rotates input
+    planes - all rotation happens during the final output to the display.
+
+[^ifdef]: Because these are macros, their presence can be tested for using
+  `#ifdef` inside the GLSL preprocessor.
+
+This same directive can also be used to bind buffer blocks (i.e.
+uniform/storage buffers), as defined by the [`BUFFER` directive](#buffer-name).
+
+### `SAVE <texture>`
+
+By default, after execution of a shader stage, the resulting output is
+captured back into the same hooked texture that triggered the shader. This
+behavior can be overridden using the explicit `SAVE` directive. For example,
+a shader might need access to a low-res version of the luma input texture in
+order to process chroma:
+
+``` glsl linenums="1"
+//!HOOK CHROMA
+//!BIND CHROMA
+//!BIND LUMA
+//!SAVE LUMA_LOWRES
+//!WIDTH CHROMA.w
+//!HEIGHT CHROMA.h
+
+vec4 hook()
+{
+    return LUMA_texOff(0);
+}
+```
+
+This shader binds both luma and chroma and resizes the luma plane down to the
+size of the chroma plane, saving the result as a new texture `LUMA_LOWRES`. In
+general, you can pick any name you want, here.
+
+### `DESC <description>`
+
+This purely informative directive simply gives the shader stage a name. This
+is the name under which the stage will be reported in shader stage and
+execution time metrics.
+
+### `OFFSET <xo yo | ALIGN>`
+
+This directive indicates a pixel shift (offset) introduced by this pass. These
+pixel offsets will be accumulated and corrected automatically as part of plane
+alignment / main scaling.
+
+A special value of `ALIGN` will attempt to counteract any existing offset of
+the hooked texture by aligning it with the reference plane (i.e. luma). This can
+be used to e.g. introduce custom chroma scaling in a way that doesn't break
+chroma subtexel offsets.
+
+An example:
+
+``` glsl linenums="1"
+//!HOOK LUMA
+//!BIND HOOKED
+//!OFFSET 100.5 100.5
+
+vec4 hook()
+{
+    // Constant offset by N pixels towards the bottom right
+    return HOOKED_texOff(-vec2(100.5));
+}
+```
+
+This (slightly silly) shader simply shifts the entire sampled region to the
+bottom right by 100.5 pixels, and propagates this shift to the main scaler
+using the `OFFSET` directive. As such, the end result of this is that there is
+no visible shift of the overall image, but some detail (~100 pixels) near the
+bottom-right border is lost due to falling outside the bounds of the texture.
+
+### `WIDTH <expr>`, `HEIGHT <expr>`
+
+These directives can be used to override the dimensions of the resulting
+texture. Note that not all textures can be resized this way. Currently, only
+`RGB`, `LUMA`, `CHROMA`, `XYZ`, `NATIVE` and `MAIN` are resizable. Trying to
+save a texture with an incompatible size to any other shader stage will result
+in an error.
+
+### `WHEN <expr>`
+
+This directive takes an expression that can be used to make shader stages
+conditionally executed. If this evaluates to 0, the shader stage will be
+skipped.
+
+Example:
+
+``` glsl linenums="1"
+//!PARAM intensity
+//!TYPE float
+//!MINIMUM 0
+1.0
+
+//!HOOK MAIN
+//!BIND HOOKED
+//!WHEN intensity 0 >
+//!DESC do something based on 'intensity'
+...
+```
+
+This example defines a shader stage that only conditionally executes itself
+if the value of the `intensity` shader parameter is non-zero.
+
+### `COMPONENTS <num>`
+
+This directive overrides the number of components present in a texture.
+For example, if you want to extract a one-dimensional feature map from the
+otherwise 3 or 4 dimensional `MAIN` texture, you can use this directive to
+save on memory bandwidth and consumption by having libplacebo only allocate a
+one-component texture to store the feature map in:
+
+``` glsl linenums="1"
+//!HOOK MAIN
+//!BIND HOOKED
+//!SAVE featuremap
+//!COMPONENTS 1
+```
+
+### `COMPUTE <bw> <bh> [<tw> <th>]`
+
+This directive specifies that the shader should be treated as a compute
+shader, with the block size `bw` and `bh`. The compute shader will be
+dispatched with however many blocks are necessary to completely tile over the
+output. Within each block, there will be `tw*th` threads, forming a single
+work group. In other words: `tw` and `th` specify the work group size, which
+can be different from the block size. So for example, a compute shader with
+`bw = bh = 32` and `tw = th = 8` running on a `500x500` texture would dispatch
+`16x16` blocks (rounded up), each with `8x8` threads.
+
+Instead of defining a `vec4 hook()`, compute shaders must define a `void
+hook()` which writes its results directly to the output texture, a `writeonly image2D
+out_image` made available to the shader stage.
+
+For example, here is a shader executing a single-pass 41x41 convolution
+(average blur) on the luma plane, using a compute shader to share sampling
+work between adjacent threads in a work group:
+
+``` glsl linenums="1"
+//!HOOK LUMA
+//!BIND HOOKED
+//!COMPUTE 32 32
+//!DESC avg convolution
+
+// Kernel size, 41x41 as an example
+const ivec2 ksize = ivec2(41, 41);
+const ivec2 offset = ksize / 2;
+
+// We need to load extra source texels to account for padding due to kernel
+// overhang
+const ivec2 isize = ivec2(gl_WorkGroupSize) + ksize - 1;
+
+shared float inp[isize.y][isize.x];
+
+void hook()
+{
+    // load texels into shmem
+    ivec2 base = ivec2(gl_WorkGroupID) * ivec2(gl_WorkGroupSize);
+    for (uint y = gl_LocalInvocationID.y; y < isize.y; y += gl_WorkGroupSize.y) {
+        for (uint x = gl_LocalInvocationID.x; x < isize.x; x += gl_WorkGroupSize.x)
+            inp[y][x] = texelFetch(HOOKED_raw, base + ivec2(x,y) - offset, 0).x;
+    }
+
+    // synchronize threads
+    barrier();
+
+    // do convolution
+    float sum = 0.0;
+    for (uint y = 0; y < ksize.y; y++) {
+        for (uint x = 0; x < ksize.x; x++)
+            sum += inp[gl_LocalInvocationID.y+y][gl_LocalInvocationID.x+x];
+    }
+
+    vec4 color = vec4(HOOKED_mul * sum / (ksize.x * ksize.y), 0, 0, 1);
+    imageStore(out_image, ivec2(gl_GlobalInvocationID), color);
+}
+```
+
+## Textures
+
+Custom textures can be defined and made available to shader stages using
+`TEXTURE` blocks. These can be used to provide e.g. LUTs or pre-trained
+weights.
+
+The data for a texture is provided as a raw hexadecimal string encoding the
+in-memory representation of a texture, according to its given texture format,
+for example:
+
+``` glsl linenums="1"
+//!TEXTURE COLORS
+//!SIZE 3 3
+//!FORMAT rgba32f
+//!FILTER NEAREST
+//!BORDER REPEAT
+0000803f000000000000000000000000000000000000803f00000000000000000000000
+0000000000000803f00000000000000000000803f0000803f000000000000803f000000
+000000803f000000000000803f0000803f00000000000000009a99993e9a99993e9a999
+93e000000009a99193F9A99193f9a99193f000000000000803f0000803f0000803f0000
+0000
+```
+
+Texture blocks accept the following directives:
+
+### `TEXTURE <name>`
+
+This must be the first directive in a texture block, and marks it as such. The
+name given is the name that the texture will be referred to (via `BIND`
+directives).
+
+### `SIZE <width> [<height> [<depth>]]`
+
+This directive gives the size of the texture, as integers. For example,
+`//!SIZE 512 512` marks a 512x512 texture block. Textures can be 1D, 2D or 3D
+depending on the number of coordinates specified.
+
+### `FORMAT <fmt>`
+
+This directive specifies the texture format. A complete list of known formats
+is exposed as part of the `pl_gpu` struct metadata, but they follow the format
+convention `rgba8`, `rg16hf`, `rgba32f`, `r64i` and so on.
+
+### `FILTER <LINEAR | NEAREST>`
+
+This directive specifies the texture magnification/minification filter.
+
+### `BORDER <CLAMP | REPEAT | MIRROR>`
+
+This directive specifies the border clamping method of the texture.
+
+### `STORAGE`
+
+If present, this directive marks the texture as a storage image. It will still
+be initialized with the initial values, but rather than being bound as a
+read-only and immutable `sampler2D`, it is bound as a `readwrite coherent
+image2D`. Such textures can be used to, for example, store persistent state
+across invocations of the shader.
+
+## Buffers
+
+Custom uniform / storage shader buffer blocks can be defined using `BUFFER`
+directives.
+
+The (initial) data for a buffer is provided as a raw hexadecimal string
+encoding the in-memory representation of a buffer in the corresponding GLSL
+packing layout (std140 or std430 for uniform and storage blocks,
+respectively):
+
+``` glsl linenums="1"
+//!BUFFER buf_uniform
+//!VAR float foo
+//!VAR float bar
+0000000000000000
+
+//!BUFFER buf_storage
+//!VAR vec2 bat
+//!VAR int big[32]
+//!STORAGE
+```
+
+Buffer blocks accept the following directives:
+
+### `BUFFER <name>`
+
+This must be the first directive in a buffer block, and marks it as such. The
+name given is mostly cosmetic, as individual variables can be accessed
+directly using the names given in the corresponding `VAR` directives.
+
+### `STORAGE`
+
+If present, this directive marks the buffer as a (readwrite coherent) shader
+storage block, instead of a readonly uniform buffer block. Such storage blocks
+can be used to track and evolve state across invocations of this shader.
+
+Storage blocks may also be initialized with default data, but this is
+optional. They can also be initialized as part of the first shader execution
+(e.g. by testing for `frame == 0`).
+
+### `VAR <type> <name>`
+
+This directive appends a new variable to the buffer block, with GLSL type
+`<type>` and shader name `<name>`. For example, `VAR float foo` introduces a
+`float foo;` member into the buffer block, and `VAR mat4 transform` introduces
+a `mat4 transform;` member.
+
+It is also possible to introduce array variables, using `[N]` as part of the
+variable name.
+
+## Tunable parameters
+
+Finally, the `PARAM` directive allows introducing tunable shader parameters,
+which are exposed programmatically as part of the C API (`pl_hook`).[^mpv]
+
+[^mpv]: In mpv using `--vo=gpu-next`, these can be set using the
+  [`--glsl-shader-opts` option](https://mpv.io/manual/master/#options-glsl-shader-opts).
+
+The default value of a parameter is given as the block body, for example:
+
+``` glsl linenums="1"
+//!PARAM contrast
+//!DESC Gain to apply to image brightness
+//!TYPE float
+//!MINIMUM 0.0
+//!MAXIMUM 100.0
+1.0
+```
+
+Parameters accept the following directives:
+
+### `PARAM <name>`
+
+This must be the first directive in a parameter block, and marks it as such.
+The name given is the name that will be used to refer to this parameter in
+GLSL code.
+
+### `DESC <description>`
+
+This directive can be used to provide a friendlier description of the shader
+parameter, exposed as part of the C API to end users.
+
+### `MINIMUM <value>`, `MAXIMUM <value>`
+
+Provides the minimum/maximum value bound of this parameter. If absent, no
+minimum/maximum is enforced.
+
+### `TYPE [ENUM] <DEFINE | [DYNAMIC | CONSTANT] <type>>`
+
+This gives the type of the parameter, which determines what type of values it
+can hold and how it will be made available to the shader. `<type>` must be
+a scalar GLSL numeric type, such as `int`, `float` or `uint`.
+
+If a type is `ENUM`, it is treated as an enumeration type. To use this, `type`
+must either be `int` or `DEFINE`. Instead of providing a single default value,
+the param body should be a list of all possible enumeration values (as separate
+lines). These names will be made available inside the shader body (as a
+`#define`), as well as inside RPN expressions (e.g. `WHEN`). The qualifiers
+`MINIMUM` and `MAXIMUM` are ignored for `ENUM` parameters, with the value
+range instead being set implicitly from the list of options.
+
+The optional qualifiers `DYNAMIC` or `CONSTANT` mark the parameter as
+dynamically changing and compile-time constant, respectively. A `DYNAMIC`
+variable is assumed to change frequently, and will be grouped with other
+frequently-changing input parameters. A `CONSTANT` parameter will be
+introduced as a compile-time constant into the shader header, which means it
+can be used in e.g. constant expressions such as array sizes.[^spec]
+
+[^spec]: On supported platforms, these are implemented using specialization
+  constants, which can be updated at run-time without requiring a full shader
+  recompilation.
+
+Finally, the special type `TYPE DEFINE` marks a variable as a preprocessor
+define, which can be used inside `#if` preprocessor expressions. For example:
+
+``` glsl linenums="1"
+//!PARAM taps
+//!DESC Smoothing taps
+//!TYPE DEFINE
+//!MINIMUM 0
+//!MAXIMUM 5
+2
+
+//!HOOK LUMA
+//!BIND HOOKED
+const uint row_size = 2 * taps + 1;
+const float weights[row_size] = {
+#if taps == 0
+    1.0,
+#endif
+
+#if taps == 1
+    0.10650697891920,
+    0.78698604216159,
+    0.10650697891920,
+#endif
+
+#if taps == 2
+    0.05448868454964,
+    0.24420134200323,
+    0.40261994689424,
+    0.24420134200323,
+    0.05448868454964,
+#endif
+
+    // ...
+};
+```
+
+An example of an enum parameter:
+
+``` glsl linenums="1"
+//!PARAM csp
+//!DESC Colorspace
+//!TYPE ENUM int
+BT709
+BT2020
+DCIP3
+
+//!HOOK MAIN
+//!BIND HOOKED
+const mat3 matrices[3] = {
+    mat3(...), // BT709
+    mat3(...), // BT2020
+    mat3(...), // DCIP3
+};
+
+#define MAT matrices[csp]
+// ...
+```
+
+## Full example
+
+A collection of full examples can be found in the [mpv user shaders
+wiki](https://github.com/mpv-player/mpv/wiki/User-Scripts#user-shaders), but
+here is an example of a parametrized Gaussian smoothed film grain compute
+shader:
+
+``` glsl linenums="1"
+//!PARAM intensity
+//!DESC Film grain intensity
+//!TYPE float
+//!MINIMUM 0
+0.1
+
+//!PARAM taps
+//!DESC Film grain smoothing taps
+//!TYPE DEFINE
+//!MINIMUM 0
+//!MAXIMUM 5
+2
+
+//!HOOK LUMA
+//!BIND HOOKED
+//!DESC Apply gaussian smoothed film grain
+//!WHEN intensity 0 >
+//!COMPUTE 32 32
+
+const uint row_size = 2 * taps + 1;
+const float weights[row_size] = {
+#if taps == 0
+    1.0,
+#endif
+
+#if taps == 1
+    0.10650697891920,
+    0.78698604216159,
+    0.10650697891920,
+#endif
+
+#if taps == 2
+    0.05448868454964,
+    0.24420134200323,
+    0.40261994689424,
+    0.24420134200323,
+    0.05448868454964,
+#endif
+
+#if taps == 3
+    0.03663284536919,
+    0.11128075847888,
+    0.21674532140370,
+    0.27068214949642,
+    0.21674532140370,
+    0.11128075847888,
+    0.03663284536919,
+#endif
+
+#if taps == 4
+    0.02763055063889,
+    0.06628224528636,
+    0.12383153680577,
+    0.18017382291138,
+    0.20416368871516,
+    0.18017382291138,
+    0.12383153680577,
+    0.06628224528636,
+    0.02763055063889,
+#endif
+
+#if taps == 5
+    0.02219054849244,
+    0.04558899978527,
+    0.07981140824009,
+    0.11906462996609,
+    0.15136080967773,
+    0.16396720767670,
+    0.15136080967773,
+    0.11906462996609,
+    0.07981140824009,
+    0.04558899978527,
+    0.02219054849244,
+#endif
+};
+
+const uvec2 isize = uvec2(gl_WorkGroupSize) + uvec2(2 * taps);
+shared float grain[isize.y][isize.x];
+
+// PRNG
+float permute(float x)
+{
+    x = (34.0 * x + 1.0) * x;
+    return fract(x * 1.0/289.0) * 289.0;
+}
+
+float seed(uvec2 pos)
+{
+    const float phi = 1.61803398874989;
+    vec3 m = vec3(fract(phi * vec2(pos)), random) + vec3(1.0);
+    return permute(permute(m.x) + m.y) + m.z;
+}
+
+float rand(inout float state)
+{
+    state = permute(state);
+    return fract(state * 1.0/41.0);
+}
+
+// Turns uniform white noise into gaussian white noise by passing it
+// through an approximation of the gaussian quantile function
+float rand_gaussian(inout float state) {
+    const float a0 = 0.151015505647689;
+    const float a1 = -0.5303572634357367;
+    const float a2 = 1.365020122861334;
+    const float b0 = 0.132089632343748;
+    const float b1 = -0.7607324991323768;
+
+    float p = 0.95 * rand(state) + 0.025;
+    float q = p - 0.5;
+    float r = q * q;
+
+    float g = q * (a2 + (a1 * r + a0) / (r*r + b1*r + b0));
+    g *= 0.255121822830526; // normalize to [-1,1)
+    return g;
+}
+
+void hook()
+{
+    // generate grain in `grain`
+    uint num_threads = gl_WorkGroupSize.x * gl_WorkGroupSize.y;
+    for (uint i = gl_LocalInvocationIndex; i < isize.y * isize.x; i += num_threads) {
+        uvec2 pos = uvec2(i % isize.y, i / isize.y);
+        float state = seed(gl_WorkGroupID.xy * gl_WorkGroupSize.xy + pos);
+        grain[pos.y][pos.x] = rand_gaussian(state);
+    }
+
+    // make writes visible
+    barrier();
+
+    // convolve horizontally
+    for (uint y = gl_LocalInvocationID.y; y < isize.y; y += gl_WorkGroupSize.y) {
+        float hsum = 0;
+        for (uint x = 0; x < row_size; x++) {
+            float g = grain[y][gl_LocalInvocationID.x + x];
+            hsum += weights[x] * g;
+        }
+
+        // update grain LUT
+        grain[y][gl_LocalInvocationID.x + taps] = hsum;
+    }
+
+    barrier();
+
+    // convolve vertically
+    float vsum = 0.0;
+    for (uint y = 0; y < row_size; y++) {
+        float g = grain[gl_LocalInvocationID.y + y][gl_LocalInvocationID.x + taps];
+        vsum += weights[y] * g;
+    }
+
+    vec4 color = HOOKED_tex(HOOKED_pos);
+    color.rgb += vec3(intensity * vsum);
+    imageStore(out_image, ivec2(gl_GlobalInvocationID), color);
+}
+```
diff --git a/docs/glsl.md b/docs/glsl.md
new file mode 100644
index 0000000..543e3a4
--- /dev/null
+++ b/docs/glsl.md
@@ -0,0 +1,501 @@
+# GLSL shader system
+
+## Overall design
+
+Shaders in libplacebo are all written in GLSL, and built up incrementally, on
+demand. Generally, all shaders for each frame are generated *per frame*. So
+functions like `pl_shader_color_map` etc. are run anew for every frame. This
+makes the renderer very stateless and allows us to directly embed relevant
+constants, uniforms etc. as part of the same code that generates the actual
+GLSL shader.
+
+To prevent this from becoming wasteful, libplacebo uses an internal string
+building abstraction
+([`pl_str_builder`](https://code.videolan.org/videolan/libplacebo/-/blob/master/src/pl_string.h#L263)).
+Rather than building up a string directly, a `pl_str_builder` is like a list of
+string building functions/callbacks to execute in order to generate the actual
+shader. Combined with an efficient `pl_str_builder_hash`, this allows us to
+avoid the bulk of the string templating work for already-cached shaders.
+
+## Legacy API
+
+For the vast majority of libplacebo's history, the main entry-point into the
+shader building mechanism was the `GLSL()` macro ([and
+variants](#shader-sections-glsl-glslh-glslf)), which works like a
+`printf`-append:
+
+```c linenums="1"
+void pl_shader_extract_features(pl_shader sh, struct pl_color_space csp)
+{
+    if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+        return;
+
+    sh_describe(sh, "feature extraction");
+    pl_shader_linearize(sh, &csp);
+    GLSL("// pl_shader_extract_features             \n"
+         "{                                         \n"
+         "vec3 lms = %f * "$" * color.rgb;          \n"
+         "lms = pow(max(lms, 0.0), vec3(%f));       \n"
+         "lms = (vec3(%f) + %f * lms)               \n"
+         "        / (vec3(1.0) + %f * lms);         \n"
+         "lms = pow(lms, vec3(%f));                 \n"
+         "float I = dot(vec3(%f, %f, %f), lms);     \n"
+         "color = vec4(I, 0.0, 0.0, 1.0);           \n"
+         "}                                         \n",
+         PL_COLOR_SDR_WHITE / 10000,
+         SH_MAT3(pl_ipt_rgb2lms(pl_raw_primaries_get(csp.primaries))),
+         PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2,
+         pl_ipt_lms2ipt.m[0][0], pl_ipt_lms2ipt.m[0][1], pl_ipt_lms2ipt.m[0][2]);
+}
+```
+
+The special macro `$` is a stand-in for an *identifier* (`ident_t`), which is
+the internal type used to pass references to loaded uniforms, descriptors and
+so on:
+
+```c
+typedef unsigned short ident_t;
+#define $           "_%hx"
+#define NULL_IDENT  0u
+
+// ...
+
+ident_t sh_var_mat3(pl_shader sh, const char *name, pl_matrix3x3 val);
+#define SH_MAT3(val) sh_var_mat3(sh, "mat", val)
+```
+
+In general, constants in libplacebo are divided into three categories:
+
+### Literal shader constants
+
+These are values that are expected to change very infrequently (or never), or
+for which we want to generate a different shader variant per value. Such values
+should be directly formatted as numbers into the shader text: `%d`, `%f` and so
+on. This is commonly used for array sizes, constants that depend only on
+hardware limits, constants that never change (but which have a friendly name,
+like `PQ_C2` above), and so on.
+
+As an example, the debanding iterations weights are hard-coded like this,
+because the debanding shader is expected to change as a result of a different
+number of iterations anyway:
+
+```c linenums="1"
+// For each iteration, compute the average at a given distance and
+// pick it instead of the color if the difference is below the threshold.
+for (int i = 1; i <= params->iterations; i++) {
+    GLSL(// Compute a random angle and distance
+         "d = "$".xy * vec2(%d.0 * "$", %f);    \n" // (1)
+         "d = d.x * vec2(cos(d.y), sin(d.y));   \n"
+         // Sample at quarter-turn intervals around the source pixel
+         "avg = T(0.0);                         \n"
+         "avg += GET(+d.x, +d.y);               \n"
+         "avg += GET(-d.x, +d.y);               \n"
+         "avg += GET(-d.x, -d.y);               \n"
+         "avg += GET(+d.x, -d.y);               \n"
+         "avg *= 0.25;                          \n"
+         // Compare the (normalized) average against the pixel
+         "diff = abs(res - avg);                \n"
+         "bound = T("$" / %d.0);                \n",
+         prng, i, radius, M_PI * 2,
+         threshold, i);
+
+    if (num_comps > 1) {
+        GLSL("res = mix(avg, res, greaterThan(diff, bound)); \n");
+    } else {
+        GLSL("res = mix(avg, res, diff > bound); \n");
+    }
+}
+```
+
+1.  The `%d.0` here corresponds to the iteration index `i`, while the `%f`
+    corresponds to the fixed constant `M_PI * 2`.
+
+### Specializable shader constants
+
+These are used for tunable parameters that are expected to change infrequently
+during normal playback. These constitute by far the biggest category, and most
+parameters coming from the various `_params` structs should be loaded like
+this.
+
+They are loaded using the `sh_const_*()` functions, which generate a
+specialization constant on supported platforms, falling back to a literal
+shader `#define` otherwise. For anonymous parameters, you can use the
+short-hands `SH_FLOAT`, `SH_INT` etc.:
+
+```c
+ident_t sh_const_int(pl_shader sh, const char *name, int val);
+ident_t sh_const_uint(pl_shader sh, const char *name, unsigned int val);
+ident_t sh_const_float(pl_shader sh, const char *name, float val);
+#define SH_INT(val)     sh_const_int(sh, "const", val)
+#define SH_UINT(val)    sh_const_uint(sh, "const", val)
+#define SH_FLOAT(val)   sh_const_float(sh, "const", val)
+```
+
+Here is an example of them in action:
+
+```c linenums="1"
+void pl_shader_sigmoidize(pl_shader sh, const struct pl_sigmoid_params *params)
+{
+    if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+        return;
+
+    params = PL_DEF(params, &pl_sigmoid_default_params);
+    float center = PL_DEF(params->center, 0.75);
+    float slope  = PL_DEF(params->slope, 6.5);
+
+    // This function needs to go through (0,0) and (1,1), so we compute the
+    // values at 1 and 0, and then scale/shift them, respectively.
+    float offset = 1.0 / (1 + expf(slope * center));
+    float scale  = 1.0 / (1 + expf(slope * (center - 1))) - offset;
+
+    GLSL("// pl_shader_sigmoidize                               \n"
+         "color = clamp(color, 0.0, 1.0);                       \n"
+         "color = vec4("$") - vec4("$") *                       \n"
+         "    log(vec4(1.0) / (color * vec4("$") + vec4("$"))   \n"
+         "        - vec4(1.0));                                 \n",
+         SH_FLOAT(center), SH_FLOAT(1.0 / slope),
+         SH_FLOAT(scale), SH_FLOAT(offset));
+}
+```
+
+The advantage of this type of shader constant is that they will be
+transparently replaced by dynamic uniforms whenever
+`pl_render_params.dynamic_constants` is true, which allows the renderer to
+respond more instantly to changes in the parameters (e.g. as a result of a user
+dragging a slider around). During "normal" playback, they will then be
+"promoted" to actual shader constants to prevent them from taking up registers.
+
+### Dynamic variables
+
+For anything else, e.g. variables which are expected to change very frequently,
+you can use the generic `sh_var()` mechanism, which sends constants either as
+elements of a uniform buffer, or directly as push constants:
+
+```c
+ident_t sh_var_int(pl_shader sh, const char *name, int val, bool dynamic);
+ident_t sh_var_uint(pl_shader sh, const char *name, unsigned int val, bool dynamic);
+ident_t sh_var_float(pl_shader sh, const char *name, float val, bool dynamic);
+#define SH_INT_DYN(val)   sh_var_int(sh, "const", val, true)
+#define SH_UINT_DYN(val)  sh_var_uint(sh, "const", val, true)
+#define SH_FLOAT_DYN(val) sh_var_float(sh, "const", val, true)
+```
+
+These are used primarily when a variable is expected to change very frequently,
+e.g. as a result of randomness, or for constants which depend on dynamically
+computed, source-dependent variables (e.g. input frame characteristics):
+
+```c linenums="1"
+if (params->show_clipping) {
+    const float eps = 1e-6f;
+    GLSL("bool clip_hi, clip_lo;                            \n"
+         "clip_hi = any(greaterThan(color.rgb, vec3("$"))); \n"
+         "clip_lo = any(lessThan(color.rgb, vec3("$")));    \n"
+         "clip_hi = clip_hi || ipt.x > "$";                 \n"
+         "clip_lo = clip_lo || ipt.x < "$";                 \n",
+         SH_FLOAT_DYN(pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, tone.input_max) + eps),
+         SH_FLOAT(pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, tone.input_min) - eps),
+         SH_FLOAT_DYN(tone.input_max + eps),
+         SH_FLOAT(tone.input_min - eps));
+}
+```
+
+### Shader sections (GLSL, GLSLH, GLSLF)
+
+Shader macros come in three main flavors, depending on where the resulting text
+should be formatted:
+
+- `GLSL`: Expanded in the scope of the current `main` function,
+  and is related to code directly processing the current pixel value.
+- `GLSLH`: Printed to the 'header', before the first function, but after
+  variables, uniforms etc. This is used for global definitions, helper
+  functions, shared memory variables, and so on.
+- `GLSLF`: Printed to the `footer`, which is always at the end of the current
+  `main` function, but before returning to the caller / writing to the
+  framebuffer. Used to e.g. update SSBO state in preparation for the next
+  frame.
+
+Finally, there is a fourth category `GLSLP` (prelude), which is currently only
+used internally to generate preambles during e.g. compute shader translation.
+
+## New #pragma GLSL macro
+
+Starting with libplacebo v6, the internal shader system has been augmented by a
+custom macro preprocessor, which is designed to ease the boilerplate of writing
+shaders (and also strip redundant whitespace from generated shaders). The code
+for this is found in the
+[tools/glsl_preproc](https://code.videolan.org/videolan/libplacebo/-/tree/master/tools/glsl_preproc)
+directory.
+
+In a nutshell, this allows us to embed GLSL snippets directly as `#pragma GLSL`
+macros (resp. `#pragma GLSLH`, `#pragma GLSLF`):
+
+```c linenums="1"
+bool pl_shader_sample_bicubic(pl_shader sh, const struct pl_sample_src *src)
+{
+    ident_t tex, pos, pt;
+    float rx, ry, scale;
+    if (!setup_src(sh, src, &tex, &pos, &pt, &rx, &ry, NULL, &scale, true, LINEAR))
+        return false;
+
+    if (rx < 1 || ry < 1) {
+        PL_TRACE(sh, "Using fast bicubic sampling when downscaling. This "
+                 "will most likely result in nasty aliasing!");
+    }
+
+    // Explanation of how bicubic scaling with only 4 texel fetches is done:
+    //   http://www.mate.tue.nl/mate/pdfs/10318.pdf
+    //   'Efficient GPU-Based Texture Interpolation using Uniform B-Splines'
+
+    sh_describe(sh, "bicubic");
+#pragma GLSL /* pl_shader_sample_bicubic */         \
+    vec4 color;                                     \
+    {                                               \
+    vec2 pos = $pos;                                \
+    vec2 size = vec2(textureSize($tex, 0));         \
+    vec2 frac  = fract(pos * size + vec2(0.5));     \
+    vec2 frac2 = frac * frac;                       \
+    vec2 inv   = vec2(1.0) - frac;                  \
+    vec2 inv2  = inv * inv;                         \
+    /* compute basis spline */                      \
+    vec2 w0 = 1.0/6.0 * inv2 * inv;                 \
+    vec2 w1 = 2.0/3.0 - 0.5 * frac2 * (2.0 - frac); \
+    vec2 w2 = 2.0/3.0 - 0.5 * inv2  * (2.0 - inv);  \
+    vec2 w3 = 1.0/6.0 * frac2 * frac;               \
+    vec4 g = vec4(w0 + w1, w2 + w3);                \
+    vec4 h = vec4(w1, w3) / g + inv.xyxy;           \
+    h.xy -= vec2(2.0);                              \
+    /* sample four corners, then interpolate */     \
+    vec4 p = pos.xyxy + $pt.xyxy * h;               \
+    vec4 c00 = textureLod($tex, p.xy, 0.0);         \
+    vec4 c01 = textureLod($tex, p.xw, 0.0);         \
+    vec4 c0 = mix(c01, c00, g.y);                   \
+    vec4 c10 = textureLod($tex, p.zy, 0.0);         \
+    vec4 c11 = textureLod($tex, p.zw, 0.0);         \
+    vec4 c1 = mix(c11, c10, g.y);                   \
+    color = ${float:scale} * mix(c1, c0, g.x);      \
+    }
+
+    return true;
+}
+```
+
+This gets transformed, by the GLSL macro preprocessor, into an optimized shader
+template invocation like the following:
+
+```c linenums="1"
+{
+    // ...
+    sh_describe(sh, "bicubic");
+    const struct __attribute__((__packed__)) {
+        ident_t pos;
+        ident_t tex;
+        ident_t pt;
+        ident_t scale;
+    } _glsl_330_args = {
+        .pos = pos,
+        .tex = tex,
+        .pt = pt,
+        .scale = sh_const_float(sh, "scale", scale),
+    };
+    size_t _glsl_330_fn(void *, pl_str *, const uint8_t *);
+    pl_str_builder_append(sh->buffers[SH_BUF_BODY], _glsl_330_fn,
+                          &_glsl_330_args, sizeof(_glsl_330_args));
+    // ...
+}
+
+size_t _glsl_330_fn(void *alloc, pl_str *buf, const uint8_t *ptr)
+{
+    struct __attribute__((__packed__)) {
+        ident_t pos;
+        ident_t tex;
+        ident_t pt;
+        ident_t scale;
+    } vars;
+    memcpy(&vars, ptr, sizeof(vars));
+
+    pl_str_append_asprintf_c(alloc, buf,
+        "/* pl_shader_sample_bicubic */\n"
+        "    vec4 color;\n"
+        "    {\n"
+        "    vec2 pos = /*pos*/_%hx;\n"
+        "    vec2 size = vec2(textureSize(/*tex*/_%hx, 0));\n"
+        "    vec2 frac  = fract(pos * size + vec2(0.5));\n"
+        "    vec2 frac2 = frac * frac;\n"
+        "    vec2 inv   = vec2(1.0) - frac;\n"
+        "    vec2 inv2  = inv * inv;\n"
+        "    /* compute basis spline */\n"
+        "    vec2 w0 = 1.0/6.0 * inv2 * inv;\n"
+        "    vec2 w1 = 2.0/3.0 - 0.5 * frac2 * (2.0 - frac);\n"
+        "    vec2 w2 = 2.0/3.0 - 0.5 * inv2  * (2.0 - inv);\n"
+        "    vec2 w3 = 1.0/6.0 * frac2 * frac;\n"
+        "    vec4 g = vec4(w0 + w1, w2 + w3);\n"
+        "    vec4 h = vec4(w1, w3) / g + inv.xyxy;\n"
+        "    h.xy -= vec2(2.0);\n"
+        "    /* sample four corners, then interpolate */\n"
+        "    vec4 p = pos.xyxy + /*pt*/_%hx.xyxy * h;\n"
+        "    vec4 c00 = textureLod(/*tex*/_%hx, p.xy, 0.0);\n"
+        "    vec4 c01 = textureLod(/*tex*/_%hx, p.xw, 0.0);\n"
+        "    vec4 c0 = mix(c01, c00, g.y);\n"
+        "    vec4 c10 = textureLod(/*tex*/_%hx, p.zy, 0.0);\n"
+        "    vec4 c11 = textureLod(/*tex*/_%hx, p.zw, 0.0);\n"
+        "    vec4 c1 = mix(c11, c10, g.y);\n"
+        "    color = /*scale*/_%hx * mix(c1, c0, g.x);\n"
+        "    }\n",
+        vars.pos,
+        vars.tex,
+        vars.pt,
+        vars.tex,
+        vars.tex,
+        vars.tex,
+        vars.tex,
+        vars.scale
+    );
+
+    return sizeof(vars);
+}
+```
+
+To support this style of shader programming, special syntax was invented:
+
+### Shader variables
+
+Instead of being formatted with `"$"`, `%f` etc. and supplied in a big list,
+printf style, GLSL macros may directly embed shader variables:
+
+```c
+ident_t pos, tex = sh_bind(sh, texture, ..., &pos, ...);
+#pragma GLSL vec4 color = texture($tex, $pos);
+```
+
+The simplest possible shader variable is just `$name`, which corresponds to
+any variable of type `ident_t`. More complicated expressions are also possible:
+
+```glsl
+#define RAND3 ${sh_prng(sh, false, NULL)}
+color.rgb += ${float:params->noise} * RAND3;
+```
+
+In the expression `${float:params->noise}`, the `float:` prefix here transforms
+the shader variable into the equivalent of `SH_FLOAT()` in the legacy API,
+that is, a generic float (specialization) constant. Other possible types are:
+
+```glsl
+TYPE  i = ${ident: sh_desc(...)};
+float f = ${float: M_PI};
+int   i = ${int:   params->width};
+uint  u = ${uint:  sizeof(ssbo)};
+```
+
+In addition to a type specifier, the optional qualifiers `dynamic` and `const`
+will modify the variable, turning it into (respectively) a dynamically loaded
+uniform (`SH_FLOAT_DYN` etc.), or a hard-coded shader literal (`%d`, `%f`
+etc.):
+
+```glsl
+const float base = ${const float: M_LOG10E};
+int seed = ${dynamic int: rand()};
+```
+
+For sampling from component masks, the special types `swizzle` and
+`(u|i)vecType` can be used to generate the appropriate texture swizzle and
+corresponding vector type:
+
+```glsl
+${vecType: comp_mask} tmp = color.${swizzle: comp_mask};
+```
+
+### Macro directives
+
+Lines beginning with `@` are not included in the GLSL as-is, but instead parsed
+as macro directives, to control the code flow inside the macro expansion:
+
+#### @if / @else
+
+Standard-purpose conditional. Example:
+
+```glsl
+float alpha = ...;
+@if (repr.alpha == PL_ALPHA_INDEPENDENT)
+    color.a *= alpha;
+@else
+    color.rgba *= alpha;
+```
+
+The condition is evaluated outside the macro (in the enclosing scope) and
+the resulting boolean variable is directly passed to the template.
+
+An `@if` block can also enclose multiple lines:
+
+```glsl
+@if (threshold > 0) {
+    float thresh = ${float:threshold};
+    coeff = mix(coeff, vec2(0.0), lessThan(coeff, vec2(thresh)));
+    coeff = mix(coeff, vec2(1.0), greaterThan(coeff, vec2(1.0 - thresh)));
+@}
+```
+
+#### @for
+
+This can be used to generate (unrolled) loops:
+
+```glsl
+int offset = ${const int: params->kernel_width / 2};
+float sum = 0.0;
+@for (x < params->kernel_width)
+    sum += textureLodOffset($luma, $pos, 0.0, int(@x - offset)).r;
+```
+
+This introduces a local variable, `@x`, which expands to an integer containing
+the current loop index. Loop indices always start at 0. Valid terminating
+conditions include `<` and `<=`, and the loop stop condition is also evaluated
+as an integer.
+
+Alternatively, this can be used to iterate over a bitmask (as commonly used for
+e.g. components in a color mask):
+
+```glsl
+float weight = /* ... */;
+vec4 color = textureLod($tex, $pos, 0.0);
+@for (c : params->component_mask)
+    sum[@c] += weight * color[@c];
+```
+
+Finally, to combine loops with conditionals, the special syntax `@if @(cond)`
+may be used to evaluate expressions inside the template loop:
+
+```glsl
+@for (i < 10) {
+    float weight = /* ... */;
+    @if @(i < 5)
+        weight = -weight;
+    sum += weight * texture(...);
+@}
+```
+
+In this case, the `@if` conditional may only reference local (loop) variables.
+
+#### @switch / @case
+
+This corresponds fairly straightforwardly to a normal switch/case from C:
+
+```glsl
+@switch (color->transfer) {
+@case PL_COLOR_TRC_SRGB:
+    color.rgb = mix(color.rgb * 1.0/12.92,
+                    pow((color.rgb + vec3(0.055)) / 1.055, vec3(2.4)),
+                    lessThan(vec3(0.04045), color.rgb));
+    @break;
+@case PL_COLOR_TRC_GAMMA18:
+    color.rgb = pow(color.rgb, vec3(1.8));
+    @break;
+@case PL_COLOR_TRC_GAMMA20:
+    color.rgb = pow(color.rgb, vec3(2.0));
+    @break;
+@case PL_COLOR_TRC_GAMMA22:
+    color.rgb = pow(color.rgb, vec3(2.2));
+    @break;
+/* ... */
+@}
+```
+
+The switch body is always evaluated as an `unsigned int`.
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 0000000..9122afe
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,36 @@
+# Introduction
+
+## Overview
+
+This document will serve as an introduction to and usage example for the
+[libplacebo](https://code.videolan.org/videolan/libplacebo) API. This is not
+intended as a full API reference; for that you should see the repository of
+[header
+files](https://code.videolan.org/videolan/libplacebo/-/tree/master/src/include/libplacebo),
+which are written to be (hopefully) understandable as-is.
+
+libplacebo exposes large parts of its internal abstractions publicly. This
+guide will take the general approach of starting as high level as possible and
+diving into the details in later chapters.
+
+A full listing of currently available APIs and their corresponding header
+files can be seen
+[here](https://code.videolan.org/videolan/libplacebo#api-overview).
+
+## Getting Started
+
+To get started using libplacebo, you need to install it (and its development
+headers) somehow onto your system. On most distributions, this should be as
+simple as installing the corresponding `libplacebo-devel` package, or the
+appropriate variants.
+
+You can see a full list of libplacebo packages and their names [on
+repology](https://repology.org/project/libplacebo/versions).
+
+!!! note "API versions"
+
+    This document is targeting the "v4 API" overhaul, and as such, examples
+    provided will generally fail to compile on libplacebo versions below v4.x.
+
+Alternatively, you can install it from the source code. For that, see the
+build instructions [located here](https://code.videolan.org/videolan/libplacebo#installing).
diff --git a/docs/options.md b/docs/options.md
new file mode 100644
index 0000000..decba48
--- /dev/null
+++ b/docs/options.md
@@ -0,0 +1,978 @@
+# Options
+
+The following provides an overview of all options available via the built-in
+`pl_options` system.
+
+## Global preset
+
+### `preset=<default|fast|high_quality>`
+
+Override all options from all sections by the values from the given
+preset. The following presets are available:
+
+- `default`: Default settings, tuned to provide a balance of performance and
+  quality. Should be fine on almost all systems.
+- `fast`: Disable all advanced rendering, equivalent to passing `no` to every
+  option. Increases performance on very slow / old integrated GPUs.
+- `high_quality`: Reset all structs to their `high_quality` presets (where
+  available), set the upscaler to `ewa_lanczossharp`, and enable `deband=yes`.
+  Suitable for use on machines with a discrete GPU.
+
+## Scaling
+
+### `upscaler=<filter>`
+
+Sets the filter used for upscaling. Defaults to `lanczos`. Pass `upscaler=help`
+to see a full list of filters. The most relevant options, roughly ordered from
+fastest to slowest:
+
+- `none`: No filter, only use basic GPU texture sampling
+- `nearest`: Nearest-neighbour (box) sampling (very fast)
+- `bilinear`: Bilinear sampling (very fast)
+- `oversample`: Aspect-ratio preserving nearest neighbour sampling (very fast)
+- `bicubic`: Bicubic interpolation (fast)
+- `gaussian`: Gaussian smoothing (fast)
+- `catmull_rom`: Catmull-Rom cubic spline
+- `lanczos`: Lanczos reconstruction
+- `ewa_lanczos`: EWA Lanczos ("Jinc") reconstruction (slow)
+- `ewa_lanczossharp`: Sharpened version of `ewa_lanczos` (slow)
+- `ewa_lanczos4sharpest`: Very sharp version of `ewa_lanczos`, with
+  anti-ringing (very slow)
+
+### `downscaler=<filter>`
+
+Sets the filter used for downscaling. Defaults to `hermite`. Pass
+`downscaler=help` to see a full list of filters. The most relevant options,
+roughly ordered from fastest to slowest:
+
+- `none`: Use the same filter as specified for `upscaler`
+- `box`: Box averaging (very fast)
+- `hermite`: Hermite-weighted averaging (fast)
+- `bilinear`: Bilinear (triangle) averaging (fast)
+- `bicubic`: Bicubic interpolation (fast)
+- `gaussian`: Gaussian smoothing (fast)
+- `catmull_rom`: Catmull-Rom cubic spline
+- `mitchell`: Mitchell-Netravali cubic spline
+- `lanczos`: Lanczos reconstruction
+
+### `plane_upscaler=<filter>`, `plane_downscaler=<filter>`
+
+Override the filter used for upscaling/downscaling planes, e.g. chroma/alpha.
+If set to `none`, use the same setting as `upscaler` and `downscaler`,
+respectively. Defaults to `none` for both.
+
+### `frame_mixer=<filter>`
+
+Sets the filter used for frame mixing (temporal interpolation). Defaults to
+`oversample`. Pass `frame_mixer=help` to see a full list of filters. The most
+relevant options, roughly ordered from fastest to slowest:
+
+- `none`: Disable frame mixing, show nearest frame to target PTS
+- `oversample`: Oversampling, only mix "edge" frames while preserving FPS
+- `hermite`: Hermite-weighted frame mixing
+- `linear`: Linear frame mixing
+- `cubic`: Cubic B-spline frame mixing
+
+### `antiringing_strength=<0.0..1.0>`
+
+Antiringing strength to use for all filters. A value of `0.0` disables
+antiringing, and a value of `1.0` enables full-strength antiringing. Defaults
+to `0.0`.
+
+!!! note
+    Specific filter presets may override this option.
+
+### Custom scalers
+
+Custom filter kernels can be created by setting the filter to `custom`, in
+addition to setting the respective options, replacing `<scaler>` by the
+corresponding scaler (`upscaler`, `downscaler`, etc.)
+
+#### `<scaler>_preset=<filter>`
+
+Overrides the value of all options in this section by their default values from
+the given filter preset.
+
+#### `<scaler>_kernel=<kernel>`, `<scaler>_window=<kernel>`
+
+Choose the filter kernel and window function, respectively. Pass `help` to
+get a full list of filter kernels. Defaults to `none`.
+
+#### `<scaler>_radius=<0.0..16.0>`
+
+Override the filter kernel radius. Has no effect if the filter kernel
+is not resizeable. Defaults to `0.0`, meaning "no override".
+
+#### `<scaler>_clamp=<0.0..1.0>`
+
+Represents an extra weighting/clamping coefficient for negative weights. A
+value of `0.0` represents no clamping. A value of `1.0` represents full
+clamping, i.e. all negative lobes will be removed. Defaults to `0.0`.
+
+#### `<scaler>_blur=<0.0..100.0>`
+
+Additional blur coefficient. This effectively stretches the kernel, without
+changing the effective radius of the filter. Setting this to a value of
+`0.0` is equivalent to disabling it. Values significantly below `1.0` may
+seriously degrade the visual output, and should be used with care. Defaults to
+`0.0`.
+
+#### `<scaler>_taper=<0.0..1.0>`
+
+Additional taper coefficient. This essentially flattens the function's center.
+The values within `[-taper, taper]` will return `1.0`, with the actual function
+being squished into the remainder of `[taper, radius]`. Defaults to `0.0`.
+
+#### `<scaler>_antiring=<0.0..1.0>`
+
+Antiringing override for this filter. Defaults to `0.0`, which infers the value
+from `antiringing_strength`.
+
+#### `<scaler>_param1`, `<scaler>_param2`, `<scaler>_wparam1`, `<scaler>_wparam2`
+
+Parameters for the respective filter function. Ignored if not tunable. Defaults
+to `0.0`.
+
+#### `<scaler>_polar=<yes|no>`
+
+If true, this filter is a polar/2D filter (EWA), instead of a separable/1D
+(orthogonal) filter. Defaults to `no`.
+
+## Debanding
+
+These options control the optional debanding step. Debanding can be used to
+reduce the prevalence of quantization artefacts in low quality sources, but
+can be heavy to compute on weaker devices.
+
+!!! note
+    This can also be used as a pure grain generator, by setting
+    `deband_iterations=0`.
+
+### `deband=<yes|no>`
+
+Enables debanding. Defaults to `no`.
+
+### `deband_preset=<default>`
+
+Overrides the value of all options in this section by their default values from
+the given preset.
+
+### `deband_iterations=<0..16>`
+
+The number of debanding steps to perform per sample. Each
+step reduces a bit more banding, but takes time to compute.
+Note that the strength of each step falls off very quickly,
+so high numbers (>4) are practically useless. Defaults to `1`.
+
+### `deband_threshold=<0.0..1000.0>`
+
+The debanding filter's cut-off threshold. Higher numbers
+increase the debanding strength dramatically, but
+progressively diminish image details. Defaults to `3.0`.
+
+### `deband_radius=<0.0..1000.0>`
+
+The debanding filter's initial radius. The radius increases
+linearly for each iteration. A higher radius will find more
+gradients, but a lower radius will smooth more aggressively.
+Defaults to `16.0`.
+
+### `deband_grain=<0.0..1000.0>`
+
+Add some extra noise to the image. This significantly helps
+cover up remaining quantization artifacts. Higher numbers add
+more noise. Defaults to `4.0`, which is very mild.
+
+### `deband_grain_neutral_r, deband_grain_neutral_g, deband_grain_neutral_b`
+
+'Neutral' grain value for each channel being debanded. Grain
+application will be modulated to avoid disturbing colors
+close to this value. Set this to a value corresponding to
+black in the relevant colorspace.
+
+!!! note
+    This is done automatically by `pl_renderer` and should not need to be
+    touched by the user. This is purely a debug option.
+
+## Sigmoidization
+
+These options control the sigmoidization parameters. Sigmoidization is an
+optional step during upscaling which reduces the prominence of ringing
+artifacts.
+
+### `sigmoid=<yes|no>`
+
+Enables sigmoidization. Defaults to `yes`.
+
+### `sigmoid_preset=<default>`
+
+Overrides the value of all options in this section by their default values from
+the given preset.
+
+### `sigmoid_center=<0.0..1.0>`
+
+The center (bias) of the sigmoid curve. Defaults to `0.75`.
+
+### `sigmoid_slope=<1.0..20.0>`
+
+The slope (steepness) of the sigmoid curve. Defaults to `6.5`.
+
+## Color adjustment
+
+These options affect the decoding of the source color values, and can be used
+to subjectively alter the appearance of the video.
+
+### `color_adjustment=<yes|no>`
+
+Enables color adjustment. Defaults to `yes`.
+
+### `color_adjustment_preset=<neutral>`
+
+Overrides the value of all options in this section by their default values from
+the given preset.
+
+### `brightness=<-1.0..1.0>`
+
+Brightness boost. Adds a constant bias onto the source
+luminance signal. `0.0` = neutral, `1.0` = solid white,
+`-1.0` = solid black. Defaults to `0.0`.
+
+### `contrast=<0.0..100.0>`
+
+Contrast gain. Multiplies the source luminance signal by a
+constant factor. `1.0` = neutral, `0.0` = solid black.
+Defaults to `1.0`.
+
+### `saturation=<0.0..100.0>`
+
+Saturation gain. Multiplies the source chromaticity signal by
+a constant factor. `1.0` = neutral, `0.0` = grayscale.
+Defaults to `1.0`.
+
+### `hue=<angle>`
+
+Hue shift. Corresponds to a rotation of the UV subvector
+around the neutral axis. Specified in radians. Defaults to
+`0.0` (neutral).
+
+### `gamma=<0.0..100.0>`
+
+Gamma lift. Subjectively brightens or darkens the scene while
+preserving overall contrast. `1.0` = neutral, `0.0` = solid
+black. Defaults to `1.0`.
+
+### `temperature=<-1.143..5.286>`
+
+Color temperature shift. Relative to 6500 K, a value of `0.0` gives you 6500 K
+(no change), a value of `-1.0` gives you 3000 K, and a value of `1.0` gives you
+10000 K. Defaults to `0.0`.
+
+## HDR peak detection
+
+These options affect the HDR peak detection step. This can be used to greatly
+improve the HDR tone-mapping process in the absence of dynamic video metadata,
+but may be prohibitively slow on some devices (e.g. weaker integrated GPUs).
+
+### `peak_detect=<yes|no>`
+
+Enables HDR peak detection. Defaults to `yes`.
+
+### `peak_detection_preset=<default|high_quality>`
+
+Overrides the value of all options in this section by their default values from
+the given preset. `high_quality` also enables frame histogram measurement.
+
+### `peak_smoothing_period=<0.0..1000.0>`
+
+Smoothing coefficient for the detected values. This controls the time parameter
+(tau) of an IIR low pass filter. In other words, it represents the cutoff period
+(= 1 / cutoff frequency) in frames. Frequencies below this length will be
+suppressed. This helps block out annoying "sparkling" or "flickering" due to
+small variations in frame-to-frame brightness. If left as `0.0`, this smoothing
+is completely disabled. Defaults to `20.0`.
+
+### `scene_threshold_low=<0.0..100.0>`, `scene_threshold_high=<0.0..100.0>`
+
+In order to avoid reacting sluggishly on scene changes as a result of the
+low-pass filter, we disable it when the difference between the current frame
+brightness and the average frame brightness exceeds a given threshold
+difference. But rather than a single hard cutoff, which would lead to weird
+discontinuities on fades, we gradually disable it over a small window of
+brightness ranges. These parameters control the lower and upper bounds of this
+window, in units of 1% PQ.
+
+Setting either one of these to 0.0 disables this logic. Defaults to `1.0` and
+`3.0`, respectively.
+
+### `peak_percentile=<0.0..100.0>`
+
+Which percentile of the input image brightness histogram to consider as the
+true peak of the scene. If this is set to `100` (or `0`), the brightest pixel
+is measured. Otherwise, the top of the frequency distribution is progressively
+cut off. Setting this too low will cause clipping of very bright details, but
+can improve the dynamic brightness range of scenes with very bright isolated
+highlights.
+
+Defaults to `100.0`. The `high_quality` preset instead sets this to `99.995`,
+which is very conservative and should cause no major issues in typical content.
+
+### `allow_delayed_peak=<yes|no>`
+
+Allows the peak detection result to be delayed by up to a single frame, which
+can sometimes improve throughput, at the cost of introducing the possibility of
+1-frame flickers on transitions. Defaults to `no`.
+
+## Color mapping
+
+These options affect the way colors are transformed between color spaces,
+including tone- and gamut-mapping where needed.
+
+### `color_map=<yes|no>`
+
+Enables the use of these color mapping settings. Defaults to `yes`.
+
+!!! note
+    Disabling this option does *not* disable color mapping, it just means "use
+    the default options for everything".
+
+### `color_map_preset=<default|high_quality>`
+
+Overrides the value of all options in this section by their default values from
+the given preset. `high_quality` also enables HDR contrast recovery.
+
+### `gamut_mapping=<function>`
+
+Gamut mapping function to use to handle out-of-gamut colors, including colors
+which are out-of-gamut as a consequence of tone mapping. Defaults to
+`perceptual`. The following options are available:
+
+- `clip`: Performs no gamut-mapping, just hard clips out-of-range colors
+  per-channel.
+- `perceptual`: Performs a perceptually balanced (saturation) gamut mapping,
+  using a soft knee function to preserve in-gamut colors, followed by a final
+  softclip operation. This works bidirectionally, meaning it can both compress
+  and expand the gamut. Behaves similar to a blend of `saturation` and
+  `softclip`.
+- `softclip`: Performs a perceptually balanced gamut mapping using a soft knee
+  function to roll-off clipped regions, and a hue shifting function to preserve
+  saturation.
+- `relative`: Performs relative colorimetric clipping, while maintaining an
+  exponential relationship between brightness and chromaticity.
+- `saturation`: Performs simple RGB->RGB saturation mapping. The input R/G/B
+  channels are mapped directly onto the output R/G/B channels. Will never clip,
+  but will distort all hues and/or result in a faded look.
+- `absolute`: Performs absolute colorimetric clipping. Like `relative`, but
+  does not adapt the white point.
+- `desaturate`: Performs constant-luminance colorimetric clipping, desaturating
+  colors towards white until they're in-range.
+- `darken`: Uniformly darkens the input slightly to prevent clipping on
+  blown-out highlights, then clamps colorimetrically to the input gamut
+  boundary, biased slightly to preserve chromaticity over luminance.
+- `highlight`: Performs no gamut mapping, but simply highlights out-of-gamut
+  pixels.
+- `linear`: Linearly/uniformly desaturates the image in order to bring the
+  entire image into the target gamut.
+
+### Gamut mapping constants
+
+These settings can be used to fine-tune the constants used for the various
+gamut mapping algorithms.
+
+#### `perceptual_deadzone=<0.0..1.0>`
+
+(Relative) chromaticity protection zone for `perceptual` mapping. Defaults to
+`0.30`.
+
+#### `perceptual_strength=<0.0..1.0>`
+
+Strength of the `perceptual` saturation mapping component. Defaults to `0.80`.
+
+#### `colorimetric_gamma=<0.0..10.0>`
+
+I vs C curve gamma to use for colorimetric clipping (`relative`, `absolute`
+and `darken`). Defaults to `1.80`.
+
+#### `softclip_knee=<0.0..1.0>`
+
+Knee point to use for soft-clipping methods (`perceptual`, `softclip`).
+Defaults to `0.70`.
+
+#### `softclip_desat=<0.0..1.0>`
+
+Desaturation strength for `softclip`. Defaults to `0.35`.
+
+### `lut3d_size_I=<0..1024>`, `lut3d_size_C=<0..1024>`, `lut3d_size_h=<0..1024>`
+
+Gamut mapping 3DLUT size. Setting a dimension to `0` picks the default value.
+Defaults to `48`, `32` and `256`, respectively, for channels `I`, `C` and `h`.
+
+### `lut3d_tricubic=<yes|no>`
+
+Use higher quality, but slower, tricubic interpolation for gamut mapping
+3DLUTs. May substantially improve the 3DLUT gamut mapping accuracy, in
+particular at smaller 3DLUT sizes. Shouldn't have much effect at the default
+size. Defaults to `no`.
+
+### `gamut_expansion=<yes|no>`
+
+If enabled, allows the gamut mapping function to expand the gamut, in cases
+where the target gamut exceeds that of the source. If disabled, the source
+gamut will never be enlarged, even when using a gamut mapping function capable
+of bidirectional mapping. Defaults to `no`.
+
+### `tone_mapping=<function>`
+
+Tone mapping function to use for adapting between different luminance ranges,
+including black point adaptation. Defaults to `spline`. The following functions
+are available:
+
+- `clip`: Performs no tone-mapping, just clips out-of-range colors. Retains
+  perfect color accuracy for in-range colors but completely destroys
+  out-of-range information. Does not perform any black point adaptation.
+- `spline`: Simple spline consisting of two polynomials, joined by a single
+  pivot point, which is tuned based on the source scene average brightness
+  (taking into account dynamic metadata if available). This function can be
+  used for both forward and inverse tone mapping.
+- `st2094-40`: EETF from SMPTE ST 2094-40 Annex B, which uses the provided OOTF
+  based on Bezier curves to perform tone-mapping. The OOTF used is adjusted
+  based on the ratio between the targeted and actual display peak luminances.
+  In the absence of HDR10+ metadata, falls back to a simple constant bezier
+  curve.
+- `st2094-10`: EETF from SMPTE ST 2094-10 Annex B.2, which takes into account
+  the input signal average luminance in addition to the maximum/minimum.
+!!! warning
+    This does *not* currently include the subjective gain/offset/gamma controls
+    defined in Annex B.3. (Open an issue with a valid sample file if you want
+    such parameters to be respected.)
+- `bt2390`: EETF from the ITU-R Report BT.2390, a hermite spline roll-off with
+  linear segment.
+- `bt2446a`: EETF from ITU-R Report BT.2446, method A. Can be used for both
+  forward and inverse tone mapping.
+- `reinhard`: Very simple non-linear curve. Named after Erik Reinhard.
+- `mobius`: Generalization of the `reinhard` tone mapping algorithm to support
+  an additional linear slope near black. The name is derived from its function
+  shape `(ax+b)/(cx+d)`, which is known as a Möbius transformation. This
+  function is considered legacy/low-quality, and should not be used.
+- `hable`: Piece-wise, filmic tone-mapping algorithm developed by John Hable
+  for use in Uncharted 2, inspired by a similar tone-mapping algorithm used by
+  Kodak. Popularized by its use in video games with HDR rendering. Preserves
+  both dark and bright details very well, but comes with the drawback of
+  changing the average brightness quite significantly. This is sort of similar
+  to `reinhard` with `reinhard_contrast=0.24`. This function is considered
+  legacy/low-quality, and should not be used.
+- `gamma`: Fits a gamma (power) function to transfer between the source and
+  target color spaces, effectively resulting in a perceptual hard-knee joining
+  two roughly linear sections. This preserves details at all scales, but can
+  result in an image with a muted or dull appearance. This function
+  is considered legacy/low-quality and should not be used.
+- `linear`: Linearly stretches the input range to the output range, in PQ
+  space. This will preserve all details accurately, but results in a
+  significantly different average brightness. Can be used for inverse
+  tone-mapping in addition to regular tone-mapping.
+- `linearlight`: Like `linear`, but in linear light (instead of PQ). Works well
+  for small range adjustments but may cause severe darkening when
+  downconverting from e.g. 10k nits to SDR.
+
+### Tone-mapping constants
+
+These settings can be used to fine-tune the constants used for the various
+tone mapping algorithms.
+
+#### `knee_adaptation=<0.0..1.0>`
+
+Configures the knee point, as a ratio between the source average and target
+average (in PQ space). An adaptation of `1.0` always adapts the source scene
+average brightness to the (scaled) target average, while a value of `0.0` never
+modifies scene brightness.
+
+Affects all methods that use the ST2094 knee point determination (currently
+`spline`, `st2094-40` and `st2094-10`). Defaults to `0.4`.
+
+#### `knee_minimum=<0.0..0.5>`, `knee_maximum=<0.5..1.0>`
+
+Configures the knee point minimum and maximum, respectively, as a percentage of
+the PQ luminance range. Provides a hard limit on the knee point chosen by
+`knee_adaptation`. Defaults to `0.1` and `0.8`, respectively.
+
+#### `knee_default=<0.0..1.0>`
+
+Default knee point to use in the absence of source scene average metadata.
+Normally, this is ignored in favor of picking the knee point as the (relative)
+source scene average brightness level. Defaults to `0.4`.
+
+#### `knee_offset=<0.5..2.0>`
+
+Knee point offset (for `bt2390` only). Note that a value of `0.5` is the
+spec-defined default behavior, which differs from the libplacebo default of
+`1.0`.
+
+#### `slope_tuning=<0.0..10.0>`, `slope_offset=<0.0..1.0>`
+
+For the single-pivot polynomial (spline) function, this controls the
+coefficients used to tune the slope of the curve. This tuning is designed to
+make the slope closer to `1.0` when the difference in peaks is low, and closer
+to linear when the difference between peaks is high. Defaults to `1.5`, with
+offset `0.2`.
+
+#### `spline_contrast=<0.0..1.5>`
+
+Contrast setting for the `spline` function. Higher values make the curve
+steeper (closer to `clip`), preserving midtones at the cost of losing
+shadow/highlight details, while lower values make the curve shallower (closer
+to `linear`), preserving highlights at the cost of losing midtone contrast.
+Values above `1.0` are possible, resulting in an output with more contrast than
+the input. Defaults to `0.5`.
+
+#### `reinhard_contrast=<0.0..1.0>`
+
+For the `reinhard` function, this specifies the local contrast coefficient at
+the display peak. Essentially, a value of `0.5` implies that the reference
+white will be about half as bright as when clipping. Defaults to `0.5`.
+
+#### `linear_knee=<0.0..1.0>`
+
+For legacy functions (`mobius`, `gamma`) which operate on linear light, this
+directly sets the corresponding knee point. Defaults to `0.3`.
+
+#### `exposure=<0.0..10.0>`
+
+For linear methods (`linear`, `linearlight`), this controls the linear
+exposure/gain applied to the image. Defaults to `1.0`.
+
+### `inverse_tone_mapping=<yes|no>`
+
+If enabled, and supported by the given tone mapping function, will perform
+inverse tone mapping to expand the dynamic range of a signal. libplacebo is not
+liable for any HDR-induced eye damage. Defaults to `no`.
+
+### `tone_map_metadata=<any|none|hdr10|hdr10plus|cie_y>`
+
+Data source to use when tone-mapping. Setting this to a specific value allows
+overriding the default metadata preference logic. Defaults to `any`.
+
+### `tone_lut_size=<0..4096>`
+
+Tone mapping LUT size. Setting `0` picks the default size. Defaults to `256`.
+
+### `contrast_recovery=<0.0..2.0>`
+
+HDR contrast recovery strength. If set to a value above `0.0`, the source image
+will be divided into high-frequency and low-frequency components, and a portion
+of the high-frequency image is added back onto the tone-mapped output. May
+cause excessive ringing artifacts for some HDR sources, but can improve the
+subjective sharpness and detail left over in the image after tone-mapping.
+
+Defaults to `0.0`. The `high_quality` preset sets this to `0.3`, which is a
+fairly conservative value and should subtly enhance the image quality without
+creating too many obvious artefacts.
+
+### `contrast_smoothness=<1.0..32.0>`
+
+HDR contrast recovery lowpass kernel size. Increasing or decreasing this will
+affect the visual appearance substantially. Defaults to `3.5`.
+
+### Debug options
+
+Miscellaneous debugging and display options related to tone/gamut mapping.
+
+#### `force_tone_mapping_lut=<yes|no>`
+
+Force the use of a full tone-mapping LUT even for functions that have faster
+pure GLSL replacements (e.g. `clip`, `linear`, `saturation`). This is a debug
+option. Defaults to `no`.
+
+#### `visualize_lut=<yes|no>`
+
+Visualize the color mapping LUTs. Displays a (PQ-PQ) graph of the active
+tone-mapping LUT. The X axis shows PQ input values, the Y axis shows PQ output
+values. The tone-mapping curve is shown in green/yellow. Yellow means the
+brightness has been boosted from the source, dark blue regions show where the
+brightness has been reduced. The extra colored regions and lines indicate
+various monitor limits, as well as a reference diagonal (neutral tone-mapping) and
+source scene average brightness information (if available). The background
+behind this shows a visualization of the gamut mapping 3DLUT, in IPT space.
+Iso-luminance, iso-chromaticity and iso-hue lines are highlighted (depending on
+the exact value of `visualize_theta`). Defaults to `no`.
+
+#### `visualize_lut_x0`, `visualize_lut_y0`, `visualize_lut_x1`, `visualize_lut_y1`
+
+Controls where to draw the LUT visualization, relative to the rendered video.
+Defaults to `0.0` for `x0`/`y0`, and `1.0` for `x1`/`y1`.
+
+#### `visualize_hue=<angle>`, `visualize_theta=<angle>`
+
+Controls the rotation of the gamut 3DLUT visualization. The `hue` parameter
+rotates the gamut through hue space (around the `I` axis), while the `theta`
+parameter vertically rotates the cross section (around the `C` axis), in
+radians. Defaults to `0.0` for both.
+
+#### `show_clipping=<yes|no>`
+
+Graphically highlight hard-clipped pixels during tone-mapping (i.e. pixels that
+exceed the claimed source luminance range). Defaults to `no`.
+
+## Dithering
+
+These options affect the way colors are dithered before output. Dithering is
+always required to avoid introducing banding artefacts as a result of
+quantization to a lower bit depth output texture.
+
+### `dither=<yes|no>`
+
+Enables dithering. Defaults to `yes`.
+
+### `dither_preset=<default>`
+
+Overrides the value of all options in this section by their default values from
+the given preset.
+
+### `dither_method=<method>`
+
+Chooses the dithering method to use. Defaults to `blue`. The following methods
+are available:
+
+- `blue`: Dither with blue noise. Very high quality, but requires the use of a
+  LUT.
+!!! warning
+    Computing a blue noise texture with a large size can be very slow, however
+    this only needs to be performed once. Even so, using this with a
+    `dither_lut_size` greater than `6` is generally ill-advised.
+- `ordered_lut`: Dither with an ordered (bayer) dither matrix, using a LUT. Low
+  quality, and since this also uses a LUT, there's generally no advantage to
+  picking this instead of `blue`. It's mainly there for testing.
+- `ordered`: The same as `ordered_lut`, but uses fixed function math instead of a
+  LUT. This is faster, but only supports a fixed dither matrix size of 16x16
+  (equivalent to `dither_lut_size=4`).
+- `white`: Dither with white noise. This does not require a LUT and is fairly
+  cheap to compute. Unlike the other modes it doesn't show any repeating
+  patterns either spatially or temporally, but the downside is that this is
+  visually fairly jarring due to the presence of low frequencies in the noise
+  spectrum.
+
+### `dither_lut_size=<1..8>`
+
+For the dither methods which require the use of a LUT (`blue`, `ordered_lut`),
+this controls the size of the LUT (base 2). Defaults to `6`.
+
+### `dither_temporal=<yes|no>`
+
+Enables temporal dithering. This reduces the persistence of dithering artifacts
+by perturbing the dithering matrix per frame. Defaults to `no`.
+
+!!! warning
+    This can cause nasty aliasing artifacts on some LCD screens.
+
+## Cone distortion
+
+These options can be optionally used to modulate the signal in LMS space, in
+particular, to simulate color blindness.
+
+### `cone=<yes|no>`
+
+Enables cone distortion. Defaults to `no`.
+
+### `cone_preset=<preset>`
+
+Overrides the value of all options in this section by their default values from
+the given preset. The following presets are available:
+
+- `normal`: No distortion (92% of population)
+- `protanomaly`: Red cone deficiency (0.66% of population)
+- `protanopia`: Red cone absence (0.59% of population)
+- `deuteranomaly`: Green cone deficiency (2.7% of population)
+- `deuteranopia`: Green cone absence (0.56% of population)
+- `tritanomaly`: Blue cone deficiency (0.01% of population)
+- `tritanopia`: Blue cone absence (0.016% of population)
+- `monochromacy`: Blue cones only (<0.001% of population)
+- `achromatopsia`: Rods only (<0.0001% of population)
+
+### `cones=<none|l|m|s|lm|ms|ls|lms>`
+
+Choose the set of cones to modulate. Defaults to `none`.
+
+### `cone_strength=<gain>`
+
+Defect/gain coefficient to apply to these cones. `1.0` = unaffected, `0.0` =
+full blindness. Defaults to `1.0`. Values above `1.0` can be used to instead
+boost the signal going to this cone. For example, to partially counteract
+deuteranomaly, you could set `cones=m`, `cone_strength=2.0`. Defaults to `0.0`.
+
+## Output blending
+
+These options affect the way the image is blended onto the output framebuffer.
+
+### `blend=<yes|no>`
+
+Enables output blending. Defaults to `no`.
+
+### `blend_preset=<alpha_overlay>`
+
+Overrides the value of all options in this section by their default values from
+the given preset. Currently, the only preset is `alpha_overlay`, which
+corresponds to normal alpha blending.
+
+### `blend_src_rgb`, `blend_src_alpha`, `blend_dst_rgb`, `blend_dst_alpha`
+
+Choose the blending mode for each component. Defaults to `zero` for all. The
+following modes are available:
+
+- `zero`: Component will be unused.
+- `one`: Component will be added at full strength.
+- `alpha`: Component will be multiplied by the source alpha value.
+- `one_minus_alpha`: Component will be multiplied by 1 minus the source alpha.
+
+## Deinterlacing
+
+Configures the settings used to deinterlace frames, if required.
+
+!!! note
+    The use of these options requires the caller to pass extra metadata to
+    incoming frames to link them together / mark them as fields.
+
+### `deinterlace=<yes|no>`
+
+Enables deinterlacing. Defaults to `no`.
+
+### `deinterlace_preset=<default>`
+
+Overrides the value of all options in this section by their default values from
+the given preset.
+
+### `deinterlace_algo=<algorithm>`
+
+Chooses the algorithm to use for deinterlacing. Defaults to `yadif`. The
+following algorithms are available:
+
+- `weave`: No-op deinterlacing, just sample the weaved frame un-touched.
+- `bob`: Naive bob deinterlacing. Doubles the field lines vertically.
+- `yadif`: "Yet another deinterlacing filter". Deinterlacer with temporal and
+  spatial information. Based on FFmpeg's Yadif filter algorithm, but adapted
+  slightly for the GPU.
+
+### `deinterlace_skip_spatial=<yes|no>`
+
+Skip the spatial interlacing check for `yadif`. Defaults to `no`.
+
+## Distortion
+
+The settings in this section can be used to distort/transform the output image.
+
+### `distort=<yes|no>`
+
+Enables distortion. Defaults to `no`.
+
+### `distort_preset=<default>`
+
+Overrides the value of all options in this section by their default values from
+the given preset.
+
+### `distort_scale_x`, `distort_scale_y`
+
+Scale the image in the X/Y dimension by an arbitrary factor. Corresponds to the
+main diagonal of the transformation matrix. Defaults to `1.0` for both.
+
+### `distort_shear_x`, `distort_shear_y`
+
+Adds the X/Y dimension onto the Y/X dimension (respectively), scaled by an
+arbitrary amount. Corresponds to the anti-diagonal of the 2x2 transformation
+matrix. Defaults to `0.0` for both.
+
+### `distort_offset_x`, `distort_offset_y`
+
+Offsets the X/Y dimensions by an arbitrary offset, relative to the image size.
+Corresponds to the bottom row of a 3x3 affine transformation matrix. Defaults
+to `0.0` for both.
+
+### `distort_unscaled=<yes|no>`
+
+If enabled, the texture is placed inside the center of the canvas without
+scaling. Otherwise, it is effectively stretched to the canvas size. Defaults
+to `no`.
+
+!!! note
+    This option has no effect when using `pl_renderer`.
+
+### `distort_constrain=<yes|no>`
+
+If enabled, the transformation is automatically scaled down and shifted to
+ensure that the resulting image fits inside the output canvas. Defaults to
+`no`.
+
+### `distort_bicubic=<yes|no>`
+
+If enabled, use bicubic interpolation rather than faster bilinear
+interpolation. Higher quality but slower. Defaults to `no`.
+
+### `distort_addreess_mode=<clamp|repeat|mirror>`
+
+Specifies the texture address mode to use when sampling out of bounds. Defaults
+to `clamp`.
+
+### `distort_alpha_mode=<none|independent|premultiplied>`
+
+If set to something other than `none`, all out-of-bounds accesses will instead
+be treated as transparent, according to the given alpha mode.
+
+## Miscellaneous renderer settings
+
+### `error_diffusion=<kernel>`
+
+Enables error diffusion dithering. Error diffusion is a very slow and memory
+intensive method of dithering without the use of a fixed dither pattern. If
+set, this will be used instead of `dither_method` whenever possible. It's
+highly recommended to use this only for still images, not moving video.
+Defaults to `none`. The following options are available:
+
+- `simple`: Simple error diffusion (fast)
+- `false-fs`: False Floyd-Steinberg kernel (fast)
+- `sierra-lite`: Sierra Lite kernel (slow)
+- `floyd-steinberg`: Floyd-Steinberg kernel (slow)
+- `atkinson`: Atkinson kernel (slow)
+- `jarvis-judice-ninke`: Jarvis, Judice & Ninke kernel (very slow)
+- `stucki`: Stucki kernel (very slow)
+- `burkes`: Burkes kernel (very slow)
+- `sierra-2`: Two-row Sierra (very slow)
+- `sierra-3`: Three-row Sierra (very slow)
+
+### `lut_type=<type>`
+
+Overrides the color mapping LUT type. Defaults to `unknown`. The following
+options are available:
+
+- `unknown`: Unknown LUT type, try and guess from metadata
+- `native`: LUT is applied to raw image contents
+- `normalized`: LUT is applied to normalized (HDR) RGB values
+- `conversion`: LUT fully replaces color conversion step
+
+!!! note
+    There is no way to load LUTs via the options mechanism, so this option only
+    has an effect if the LUT is loaded via external means.
+
+### `background_r=<0.0..1.0>`, `background_g=<0.0..1.0>`, `background_b=<0.0..1.0>`
+
+If the image being rendered does not span the entire size of the target, it
+will be cleared explicitly using this background color (RGB). Defaults to `0.0`
+for all.
+
+### `background_transparency=<0.0..1.0>`
+
+The (inverted) alpha value of the background clear color. Defaults to `0.0`.
+
+### `skip_target_clearing=<yes|no>`
+
+If set, skips clearing the background backbuffer entirely. Defaults to `no`.
+
+!!! note
+    This is automatically skipped if the image to be rendered would completely
+    cover the backbuffer.
+
+### `corner_rounding=<0.0..1.0>`
+
+If set to a value above `0.0`, the output will be rendered with rounded
+corners, as if an alpha transparency mask had been applied. The value indicates
+the relative fraction of the side length to round - a value of `1.0` rounds the
+corners as much as possible. Defaults to `0.0`.
+
+### `blend_against_tiles=<yes|no>`
+
+If true, then transparent images will be made opaque by painting them against
+a checkerboard pattern consisting of alternating colors. Defaults to `no`.
+
+### `tile_color_hi_r`, `tile_color_hi_g`, `tile_color_hi_b`, `tile_color_lo_r`, `tile_color_lo_g`, `tile_color_lo_b`
+
+The colors of the light/dark tiles used for `blend_against_tiles`. Defaults to
+`0.93` for light R/G/B and `0.87` for dark R/G/B, respectively.
+
+### `tile_size=<2..256>`
+
+The size, in output pixels, of the tiles used for `blend_against_tiles`.
+Defaults to `32`.
+
+## Performance / quality trade-offs
+
+These should generally be left off where quality is desired, as they can
+degrade the result quite noticeably; but may be useful for older or slower
+hardware. Note that libplacebo will automatically disable advanced features on
+hardware where they are unsupported, regardless of these settings. So only
+enable them if you need a performance bump.
+
+### `skip_anti_aliasing=<yes|no>`
+
+Disables anti-aliasing on downscaling. This will result in moiré artifacts and
+nasty, jagged pixels when downscaling, except for some very limited special
+cases (e.g. bilinear downsampling to exactly 0.5x). Significantly speeds up
+downscaling with high downscaling ratios. Defaults to `no`.
+
+### `preserve_mixing_cache=<yes|no>`
+
+Normally, when the size of the target framebuffer changes, or the render
+parameters are updated, the internal cache of mixed frames must be discarded in
+order to re-render all required frames. Setting this option to `yes` will skip
+the cache invalidation and instead re-use the existing frames (with bilinear
+scaling to the new size if necessary). This comes at a hefty quality loss
+shortly after a resize, but should make it much more smooth. Defaults to `no`.
+
+## Debugging, tuning and testing
+
+These may affect performance or may make debugging problems easier, but
+shouldn't have any effect on the quality (except where otherwise noted).
+
+### `skip_caching_single_frame=<yes|no>`
+
+Normally, single frames will also get pushed through the mixer cache, in order
+to speed up re-draws. Enabling this option disables that logic, causing single
+frames to bypass being written to the cache. Defaults to `no`.
+
+!!! note
+    If a frame is *already* cached, it will be re-used, regardless.
+
+### `disable_linear_scaling=<yes|no>`
+
+Disables linearization / sigmoidization before scaling. This might be useful
+when tracking down unexpected image artifacts or excessive ringing, but it
+shouldn't normally be necessary. Defaults to `no`.
+
+### `disable_builtin_scalers=<yes|no>`
+
+Forces the use of the slower, "general" scaling algorithms even when faster
+built-in replacements exist. Defaults to `no`.
+
+### `correct_subpixel_offsets=<yes|no>`
+
+Forces correction of subpixel offsets (using the configured `upscaler`).
+Defaults to `no`.
+
+!!! warning
+    Enabling this may cause such images to get noticeably blurrier, especially
+    when using a polar scaler. It's not generally recommended to enable this.
+
+### `force_dither=<yes|no>`
+
+Forces the use of dithering, even when rendering to 16-bit FBOs. This is
+generally pretty pointless because most 16-bit FBOs have high enough depth that
+rounding errors are below the human perception threshold, but this can be used
+to test the dither code. Defaults to `no`.
+
+### `disable_dither_gamma_correction=<yes|no>`
+
+Disables the gamma-correct dithering logic which normally applies when
+dithering to low bit depths. No real use, outside of testing. Defaults to `no`.
+
+### `disable_fbos=<yes|no>`
+
+Completely overrides the use of FBOs, as if there were no renderable texture
+format available. This disables most features. Defaults to `no`.
+
+### `force_low_bit_depth_fbos=<yes|no>`
+
+Use only low-bit-depth FBOs (8 bits). Note that this also implies disabling
+linear scaling and sigmoidization. Defaults to `no`.
+
+### `dynamic_constants=<yes|no>`
+
+If this is enabled, all shaders will be generated as "dynamic" shaders, with
+any compile-time constants being replaced by runtime-adjustable values. This is
+generally a performance loss, but has the advantage of being able to freely
+change parameters without triggering shader recompilations. It's a good idea to
+enable this if you will change these options very frequently, but it should be
+disabled once those values are "dialed in". Defaults to `no`.
diff --git a/docs/renderer.md b/docs/renderer.md
new file mode 100644
index 0000000..3104b0d
--- /dev/null
+++ b/docs/renderer.md
@@ -0,0 +1,302 @@
+# Rendering content: pl_frame, pl_renderer, and pl_queue
+
+This example roughly builds off the [previous entry](./basic-rendering.md),
+and as such will not cover the basics of how to create a window, initialize a
+`pl_gpu` and get pixels onto the screen.
+
+## Renderer
+
+The `pl_renderer` set of APIs represents the highest-level interface into
+libplacebo, and is what most users who simply want to display e.g. a video
+feed on-screen will want to be using.
+
+The basic initialization is straightforward, requiring no extra parameters:
+
+``` c linenums="1"
+pl_renderer renderer;
+
+init()
+{
+    renderer = pl_renderer_create(pllog, gpu);
+    if (!renderer)
+        goto error;
+
+    // ...
+}
+
+uninit()
+{
+    pl_renderer_destroy(&renderer);
+}
+```
+
+What makes the renderer powerful is the large number of `pl_render_params` it
+exposes. By default, libplacebo provides several presets to use:
+
+* **pl_render_fast_params**: Disables everything except for defaults. This is
+  the fastest possible configuration.
+* **pl_render_default_params**: Contains the recommended default parameters,
+  including some slightly higher quality scaling, as well as dithering.
+* **pl_render_high_quality_params**: A preset of reasonable defaults for a
+  higher-end machine (i.e. anything with a discrete GPU). This enables most
+  of the basic functionality, including upscaling, downscaling, debanding
+  and better HDR tone mapping.
+
+Covering all of the possible options exposed by `pl_render_params` is
+out-of-scope of this example and would be better served by looking at [the API
+documentation](https://code.videolan.org/videolan/libplacebo/-/blob/master/src/include/libplacebo/renderer.h#L94).
+
+### Frames
+
+[`pl_frame`](https://code.videolan.org/videolan/libplacebo/-/blob/master/src/include/libplacebo/renderer.h#L503)
+is the struct libplacebo uses to group textures and their metadata together
+into a coherent unit that can be rendered using the renderer. This is not
+currently a dynamically allocated or refcounted heap object, it is merely a
+struct that can live on the stack (or anywhere else). The actual data lives in
+corresponding `pl_tex` objects referenced in each of the frame's planes.
+
+``` c linenums="1"
+bool render_frame(const struct pl_frame *image,
+                  const struct pl_swapchain_frame *swframe)
+{
+    struct pl_frame target;
+    pl_frame_from_swapchain(&target, swframe);
+
+    return pl_render_image(renderer, image, &target,
+                           &pl_render_default_params);
+}
+```
+
+!!! note "Renderer state"
+    The `pl_renderer` is conceptually (almost) stateless. The only thing that
+    is needed to get a different result is to change the render params, which
+    can be varied freely on every call, if the user desires.
+
+    The one case where this is not entirely true is when using frame mixing
+    (see below), or when using HDR peak detection. In this case, the renderer
+    can be explicitly reset using `pl_renderer_flush_cache`.
+
+To upload frames, the easiest methods are made available as dedicated helpers
+in
+[`<libplacebo/utils/upload.h>`](https://code.videolan.org/videolan/libplacebo/-/blob/master/src/include/libplacebo/utils/upload.h),
+and
+[`<libplacebo/utils/libav.h>`](https://code.videolan.org/videolan/libplacebo/-/blob/master/src/include/libplacebo/utils/libav.h)
+(for AVFrames). In general, I recommend checking out the [demo
+programs](https://code.videolan.org/videolan/libplacebo/-/tree/master/demos)
+for a clearer illustration of how to use them in practice.
+
+### Shader cache
+
+The renderer internally generates, compiles and caches a potentially large
+number of shader programs, some of which can be complex. On some platforms
+(notably D3D11), these can be quite costly to recompile on every program
+launch.
+
+As such, the renderer offers a way to save/restore its internal shader cache
+from some external location (managed by the API user). The use of this API is
+highly recommended:
+
+``` c linenums="1" hl_lines="1-2 10-14 21-27"
+static uint8_t *load_saved_cache();
+static void store_saved_cache(uint8_t *cache, size_t bytes);
+
+void init()
+{
+    renderer = pl_renderer_create(pllog, gpu);
+    if (!renderer)
+        goto error;
+
+    uint8_t *cache = load_saved_cache();
+    if (cache) {
+        pl_renderer_load(renderer, cache);
+        free(cache);
+    }
+
+    // ...
+}
+
+void uninit()
+{
+    size_t cache_bytes = pl_renderer_save(renderer, NULL);
+    uint8_t *cache = malloc(cache_bytes);
+    if (cache) {
+        pl_renderer_save(renderer, cache);
+        store_saved_cache(cache, cache_bytes);
+        free(cache);
+    }
+
+    pl_renderer_destroy(&renderer);
+}
+```
+
+!!! warning "Cache safety"
+    libplacebo performs only minimal validity checking on the shader cache,
+    and in general, cannot possibly guard against malicious alteration of such
+    files. Loading a cache from an untrusted source represents a remote code
+    execution vector.
+
+## Frame mixing
+
+One of the renderer's most powerful features is its ability to compensate
+for differences in framerates between the source and display by using [frame
+mixing](https://github.com/mpv-player/mpv/wiki/Interpolation) to blend
+adjacent frames together.
+
+Using this API requires presenting the renderer, at each vsync, with a
+`pl_frame_mix` struct, describing the current state of the vsync. In
+principle, such structs can be constructed by hand. To do this, all of the
+relevant frames (nearby the vsync timestamp) must be collected, and their
+relative distances to the vsync determined, by normalizing all PTS values such
+that the vsync represents time `0.0` (and a distance of `1.0` represents the
+nominal duration between adjacent frames). Note that timing vsyncs, and
+determining the correct vsync duration, are both left as problems for the user
+to solve.[^timing] Here is an example of a valid struct:
+
+[^timing]: However, this may change in the future, as the recent introduction of
+  the Vulkan display timing extension may result in display timing feedback
+  being added to the `pl_swapchain` API. That said, as of writing, this has
+  not yet happened.
+
+``` c
+(struct pl_frame_mix) {
+    .num_frames = 6,
+    .frames = (const struct pl_frame *[]) {
+        /* frame 0 */
+        /* frame 1 */
+        /* ... */
+        /* frame 5 */
+    },
+    .signatures = (uint64_t[]) {
+        0x0, 0x1, 0x2, 0x3, 0x4, 0x5 // (1)
+    },
+    .timestamps = (float[]) {
+        -2.4, -1.4, -0.4, 0.6, 1.6, 2.6, // (2)
+    },
+    .vsync_duration = 0.4, // 24 fps video on 60 fps display
+}
+```
+
+1.  These must be unique per frame, but always refer to the same frame. For
+    example, this could be based on the frame's PTS, the frame's numerical ID
+    (in order of decoding), or some sort of hash. The details don't matter,
+    only that this uniquely identifies specific frames.
+
+2.  Typically, for CFR sources, frame timestamps will always be separated in
+    this list by a distance of 1.0. In this example, the vsync falls roughly
+    halfway (but not quite) in between two adjacent frames (with IDs 0x2 and
+    0x3).
+
+!!! note "Frame mixing radius"
+    In this example, the frame mixing radius (as determined by
+    `pl_frame_mix_radius`) is `3.0`, so we include all frames that fall within
+    the timestamp interval of `[-3, 3)`. In general, you should consult this
+    function to determine what frames need to be included in the
+    `pl_frame_mix` - though including more frames than needed is not an error.
+
+### Frame queue
+
+Because this API is rather unwieldy and clumsy to use directly, libplacebo
+provides a helper abstraction known as `pl_queue` to assist in transforming
+some arbitrary source of frames (such as a video decoder) into nicely packed
+`pl_frame_mix` structs ready for consumption by the `pl_renderer`:
+
+``` c linenums="1"
+#include <libplacebo/utils/frame_queue.h>
+
+pl_queue queue;
+
+void init()
+{
+    queue = pl_queue_create(gpu);
+}
+
+void uninit()
+{
+    pl_queue_destroy(&queue);
+    // ...
+}
+```
+
+This queue can be interacted with through a number of mechanisms: either
+pushing frames (blocking or non-blocking), or by having the queue poll frames
+(via blocking or non-blocking callback) as-needed. For a full overview of the
+various methods of pushing and polling frames, check the [API
+documentation](https://code.videolan.org/videolan/libplacebo/-/blob/master/src/include/libplacebo/utils/frame_queue.h#L115).
+
+In this example, I will assume that we have a separate decoder thread pushing
+frames into the `pl_queue` in a blocking manner:
+
+``` c linenums="1"
+static void decoder_thread(void)
+{
+    void *frame;
+
+    while ((frame = /* decode new frame */)) {
+        pl_queue_push_block(queue, UINT64_MAX, &(struct pl_source_frame) {
+            .pts        = /* frame pts */,
+            .duration   = /* frame duration */,
+            .map        = /* map callback */,
+            .unmap      = /* unmap callback */,
+            .frame_data = frame,
+        });
+    }
+
+    pl_queue_push(queue, NULL); // signal EOF
+}
+```
+
+Now, in our render loop, we want to call `pl_queue_update` with appropriate
+values to retrieve the correct frame mix for each vsync:
+
+``` c linenums="1" hl_lines="3-10 12-21 27"
+bool render_frame(const struct pl_swapchain_frame *swframe)
+{
+    struct pl_frame_mix mix;
+    enum pl_queue_status res;
+    res = pl_queue_update(queue, &mix, pl_queue_params(
+        .pts            = /* time of next vsync */,
+        .radius         = pl_frame_mix_radius(&render_params),
+        .vsync_duration = /* if known */,
+        .timeout        = UINT64_MAX, // (2)
+    ));
+
+    switch (res) {
+    case PL_QUEUE_OK:
+        break;
+    case PL_QUEUE_EOF:
+        /* no more frames */
+        return false;
+    case PL_QUEUE_ERR:
+        goto error;
+    // (1)
+    }
+
+
+    struct pl_frame target;
+    pl_frame_from_swapchain(&target, swframe);
+
+    return pl_render_image_mix(renderer, &mix, &target,
+                               &pl_render_default_params);
+}
+```
+
+1.  There is a fourth status, `PL_QUEUE_MORE`, which is returned only if the
+    resulting frame mix is incomplete (and the timeout was reached) -
+    basically this can only happen if the queue runs dry due to frames not
+    being supplied fast enough.
+
+    In this example, since we are setting `timeout` to `UINT64_MAX`, we will
+    never get this return value.
+
+2.  Setting this makes `pl_queue_update` block indefinitely until sufficiently
+    many frames have been pushed into the `pl_queue` from our separate
+    decoding thread.
+
+### Deinterlacing
+
+The frame queue also vastly simplifies the process of performing
+motion-adaptive temporal deinterlacing, by automatically linking together
+adjacent fields/frames. To take advantage of this, all you need to do is set
+the appropriate field (`pl_source_frame.first_field`), as well as enabling
+[deinterlacing
+parameters](https://code.videolan.org/videolan/libplacebo/-/blob/master/src/include/libplacebo/renderer.h#L186).
diff --git a/docs/style.css b/docs/style.css
new file mode 100644
index 0000000..81ed8a8
--- /dev/null
+++ b/docs/style.css
@@ -0,0 +1,3 @@
+.md-typeset p {
+    margin: 1em 1em;
+}